workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (113)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +261 -235
  4. workbench/algorithms/graph/light/proximity_graph.py +10 -8
  5. workbench/api/__init__.py +2 -1
  6. workbench/api/compound.py +1 -1
  7. workbench/api/endpoint.py +11 -0
  8. workbench/api/feature_set.py +11 -8
  9. workbench/api/meta.py +5 -2
  10. workbench/api/model.py +16 -15
  11. workbench/api/monitor.py +1 -16
  12. workbench/core/artifacts/__init__.py +11 -2
  13. workbench/core/artifacts/artifact.py +11 -3
  14. workbench/core/artifacts/data_capture_core.py +355 -0
  15. workbench/core/artifacts/endpoint_core.py +256 -118
  16. workbench/core/artifacts/feature_set_core.py +265 -16
  17. workbench/core/artifacts/model_core.py +107 -60
  18. workbench/core/artifacts/monitor_core.py +33 -248
  19. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  20. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  21. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  22. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  23. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  24. workbench/core/transforms/features_to_model/features_to_model.py +42 -32
  25. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  26. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  27. workbench/core/views/training_view.py +113 -42
  28. workbench/core/views/view.py +53 -3
  29. workbench/core/views/view_utils.py +4 -4
  30. workbench/model_scripts/chemprop/chemprop.template +852 -0
  31. workbench/model_scripts/chemprop/generated_model_script.py +852 -0
  32. workbench/model_scripts/chemprop/requirements.txt +11 -0
  33. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  34. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  35. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  36. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  37. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  38. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  39. workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
  40. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  41. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  42. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  43. workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
  44. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  45. workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
  46. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  47. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  48. workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
  49. workbench/model_scripts/pytorch_model/pytorch.template +370 -187
  50. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  51. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  52. workbench/model_scripts/script_generation.py +17 -9
  53. workbench/model_scripts/uq_models/generated_model_script.py +605 -0
  54. workbench/model_scripts/uq_models/mapie.template +605 -0
  55. workbench/model_scripts/uq_models/requirements.txt +1 -0
  56. workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
  57. workbench/model_scripts/xgb_model/xgb_model.template +44 -46
  58. workbench/repl/workbench_shell.py +28 -14
  59. workbench/scripts/endpoint_test.py +162 -0
  60. workbench/scripts/lambda_test.py +73 -0
  61. workbench/scripts/ml_pipeline_batch.py +137 -0
  62. workbench/scripts/ml_pipeline_sqs.py +186 -0
  63. workbench/scripts/monitor_cloud_watch.py +20 -100
  64. workbench/utils/aws_utils.py +4 -3
  65. workbench/utils/chem_utils/__init__.py +0 -0
  66. workbench/utils/chem_utils/fingerprints.py +134 -0
  67. workbench/utils/chem_utils/misc.py +194 -0
  68. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  69. workbench/utils/chem_utils/mol_standardize.py +450 -0
  70. workbench/utils/chem_utils/mol_tagging.py +348 -0
  71. workbench/utils/chem_utils/projections.py +209 -0
  72. workbench/utils/chem_utils/salts.py +256 -0
  73. workbench/utils/chem_utils/sdf.py +292 -0
  74. workbench/utils/chem_utils/toxicity.py +250 -0
  75. workbench/utils/chem_utils/vis.py +253 -0
  76. workbench/utils/chemprop_utils.py +760 -0
  77. workbench/utils/cloudwatch_handler.py +1 -1
  78. workbench/utils/cloudwatch_utils.py +137 -0
  79. workbench/utils/config_manager.py +3 -7
  80. workbench/utils/endpoint_utils.py +5 -7
  81. workbench/utils/license_manager.py +2 -6
  82. workbench/utils/model_utils.py +95 -34
  83. workbench/utils/monitor_utils.py +44 -62
  84. workbench/utils/pandas_utils.py +3 -3
  85. workbench/utils/pytorch_utils.py +526 -0
  86. workbench/utils/shap_utils.py +10 -2
  87. workbench/utils/workbench_logging.py +0 -3
  88. workbench/utils/workbench_sqs.py +1 -1
  89. workbench/utils/xgboost_model_utils.py +371 -156
  90. workbench/web_interface/components/model_plot.py +7 -1
  91. workbench/web_interface/components/plugin_unit_test.py +5 -2
  92. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  93. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  94. workbench/web_interface/components/plugins/model_details.py +9 -7
  95. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  96. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
  97. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
  98. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
  99. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
  100. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  101. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  102. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  103. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  104. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  105. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  106. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  107. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  108. workbench/utils/chem_utils.py +0 -1556
  109. workbench/utils/execution_environment.py +0 -211
  110. workbench/utils/fast_inference.py +0 -167
  111. workbench/utils/resource_utils.py +0 -39
  112. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
  113. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -4,13 +4,10 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
 from scipy.optimize import minimize
+from scipy.stats import spearmanr

 from io import StringIO
 import json
@@ -23,7 +20,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
     "train_all_data": "{{train_all_data}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }


@@ -47,7 +44,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +87,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -172,16 +166,14 @@ if __name__ == "__main__":
     cv_residuals = np.array(cv_residuals)
     cv_uncertainties = np.array(cv_uncertainties)

-
     # Optimize calibration parameters: σ_cal = a * σ_uc + b
     def neg_log_likelihood(params):
         a, b = params
         sigma_cal = a * cv_uncertainties + b
         sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
-        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal ** 2) + 0.5 * (cv_residuals ** 2) / (sigma_cal ** 2))
+        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))

-
-    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
    cal_a, cal_b = result.x

     print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +197,9 @@ if __name__ == "__main__":
     result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)

     # Compute uncalibrated uncertainty
-    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(axis=1)
+    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+        axis=1
+    )

     # Apply calibration to uncertainty
     result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -224,11 +218,16 @@ if __name__ == "__main__":
     # Report Performance Metrics
     rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
     mae = mean_absolute_error(result_df[target], result_df["prediction"])
+    medae = median_absolute_error(result_df[target], result_df["prediction"])
     r2 = r2_score(result_df[target], result_df["prediction"])
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
-    print(f"NumRows: {len(result_df)}")
+    spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
+    support = len(result_df)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")

     # Now save the models
     for name, model in models.items():
@@ -352,4 +351,4 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df.reindex(sorted(df.columns), axis=1)

     # All done, return the DataFrame
-    return df
+    return df
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

 from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")

@@ -76,10 +76,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
     )

     # Create a Pipeline with StandardScaler
-    model = Pipeline([
-        ("scaler", StandardScaler()),
-        ("model", model)
-    ])
+    model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Prepare features and targets for training
     X_train = df_train[features]
@@ -1,34 +1,34 @@
 # Model: NGBoost Regressor with Distribution output
 from ngboost import NGBRegressor
-from xgboost import XGBRegressor  # Base Estimator
+from ngboost.distns import Cauchy
+from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
+from scipy.stats import spearmanr

 from io import StringIO
 import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
+from typing import List, Tuple

 # Local Imports
 from proximity import Proximity


-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
-    "features": "{{feature_list}}",
     "target": "{{target_column}}",
+    "features": "{{feature_list}}",
+    "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }


@@ -72,16 +72,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)


-# TRAINING SECTION
-#
-# This section (__main__) is where SageMaker will execute the training job
-# and save the model artifacts to the model directory.
-#
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+    """
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features.copy()
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
+
+
 if __name__ == "__main__":
     # Template Parameters
     id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2
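
The new decompress_features helper above expands a fixed-width bitstring column (for example a fingerprint stored as "0101...") into one uint8 column per bit, prefixed with the first three characters of the original column name. A small sketch of that expansion under the same assumptions; the column names and values here are made up for illustration:

    import numpy as np
    import pandas as pd

    # Hypothetical input: an 8-bit fingerprint stored as a bitstring plus one numeric feature
    df = pd.DataFrame({"fp_bits": ["01010101", "11110000"], "logp": [1.2, 3.4]})

    # Core of the expansion performed by decompress_features: one uint8 column per bit
    bit_matrix = np.array([list(bs) for bs in df["fp_bits"]], dtype=np.uint8)
    prefix = "fp_bits"[:3]                                             # -> "fp_", as in the template
    new_cols = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]   # fp__0 ... fp__7
    expanded = pd.concat(
        [df.drop(columns=["fp_bits"]), pd.DataFrame(bit_matrix, columns=new_cols, index=df.index)], axis=1
    )
    print(expanded.columns.tolist())  # ['logp', 'fp__0', ..., 'fp__7']
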
@@ -95,63 +178,77 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

-    # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    # Read the training data into DataFrames
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
-    df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")

-    # Check if the DataFrame is empty
-    check_dataframe(df, "training_df")
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)

-    # Training data split logic
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-        # Use all data for both training and validation
-        print("Training on all data...")
-        df_train = df.copy()
-        df_val = df.copy()
-    elif "training" in df.columns:
-        # Split data based on a 'training' column if it exists
-        print("Splitting data based on 'training' column...")
-        df_train = df[df["training"]].copy()
-        df_val = df[~df["training"]].copy()
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        # Perform a random split if no 'training' column is found
-        print("Splitting data randomly...")
-        df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")

     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
     xgb_model = XGBRegressor()
-    ngb_model = NGBRegressor()
+    ngb_model = NGBRegressor()  # Dist=Cauchy) Seems to give HUGE prediction intervals

     # Prepare features and targets for training
     X_train = df_train[features]
-    X_val = df_val[features]
+    X_validate = df_val[features]
     y_train = df_train[target]
-    y_val = df_val[target]
+    y_validate = df_val[target]

     # Train both models using the training data
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
     preds = xgb_model.predict(X_validate)

     # Calculate various model performance metrics (regression)
     rmse = root_mean_squared_error(y_validate, preds)
     mae = mean_absolute_error(y_validate, preds)
+    medae = median_absolute_error(y_validate, preds)
     r2 = r2_score(y_validate, preds)
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
-    print(f"NumRows: {len(df_val)}")
+    spearman_corr = spearmanr(y_validate, preds).correlation
+    support = len(df_val)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")

     # Save the trained XGBoost model
     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
@@ -159,9 +256,9 @@ if __name__ == "__main__":
     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-    # Save the feature list to validate input during predictions
+    # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(features, fp)
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

     # Now the Proximity model
     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -187,11 +284,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)

-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


 def input_fn(input_data, content_type):
@@ -251,20 +344,31 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
-    df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-    df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+    # so we need to adjust the bounds to include the point prediction
+    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-    # Adjust prediction intervals to include point predictions
-    df["q_025"] = df[["q_025", "prediction"]].min(axis=1)
-    df["q_975"] = df[["q_975", "prediction"]].max(axis=1)
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Compute Nearest neighbors with Proximity model
@@ -3,11 +3,8 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
+from scipy.stats import spearmanr

 from io import StringIO
 import json
@@ -21,7 +18,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -87,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -136,11 +130,16 @@ if __name__ == "__main__":
     # Calculate various model performance metrics (regression)
     rmse = root_mean_squared_error(y_validate, preds)
     mae = mean_absolute_error(y_validate, preds)
+    medae = median_absolute_error(y_validate, preds)
     r2 = r2_score(y_validate, preds)
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
-    print(f"NumRows: {len(df_val)}")
+    spearman_corr = spearmanr(y_validate, preds).correlation
+    support = len(df_val)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")

     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
@@ -212,16 +211,29 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Return the modified DataFrame
     return df