workbench 0.8.170__py3-none-any.whl → 0.8.172__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (27)
  1. workbench/api/feature_set.py +4 -4
  2. workbench/core/artifacts/artifact.py +11 -3
  3. workbench/core/artifacts/model_core.py +37 -14
  4. workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
  5. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  6. workbench/core/transforms/features_to_model/features_to_model.py +4 -4
  7. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +319 -210
  8. workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
  9. workbench/model_scripts/custom_models/uq_models/meta_uq.template +154 -41
  10. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -2
  11. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  12. workbench/model_scripts/script_generation.py +5 -0
  13. workbench/model_scripts/xgb_model/generated_model_script.py +11 -11
  14. workbench/model_scripts/xgb_model/xgb_model.template +7 -7
  15. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +1 -1
  16. workbench/scripts/ml_pipeline_sqs.py +139 -0
  17. workbench/utils/model_utils.py +13 -1
  18. workbench/utils/workbench_sqs.py +1 -1
  19. workbench/utils/xgboost_model_utils.py +1 -0
  20. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  21. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/METADATA +1 -1
  22. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/RECORD +26 -25
  23. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/entry_points.txt +2 -1
  24. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  25. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/WHEEL +0 -0
  26. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
  # Model: NGBoost Regressor with Distribution output
  from ngboost import NGBRegressor
- from xgboost import XGBRegressor  # Base Estimator
+ from ngboost.distns import Cauchy
+ from xgboost import XGBRegressor  # Point Estimator
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
@@ -15,7 +16,9 @@ import json
  import argparse
  import joblib
  import os
+ import numpy as np
  import pandas as pd
+ from typing import List, Tuple

  # Local Imports
  from proximity import Proximity
@@ -25,8 +28,9 @@ from proximity import Proximity
  # Template Placeholders
  TEMPLATE_PARAMS = {
      "id_column": "{{id_column}}",
-     "features": "{{feature_list}}",
      "target": "{{target_column}}",
+     "features": "{{feature_list}}",
+     "compressed_features": "{{compressed_features}}",
      "train_all_data": "{{train_all_data}}",
      "track_columns": "{{track_columns}}"
  }
@@ -72,16 +76,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      return df.rename(columns=rename_dict)


- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
  if __name__ == "__main__":
      # Template Parameters
      id_column = TEMPLATE_PARAMS["id_column"]
-     features = TEMPLATE_PARAMS["features"]
      target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
      track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
      validation_split = 0.2
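
Editor's note: the decompress_features helper added above expands a bitstring column into one uint8 column per bit. A minimal usage sketch (toy data, not part of the package; assumes the decompress_features definition above and pandas/numpy are in scope):

    import pandas as pd

    # Toy DataFrame with one ordinary feature and one bitstring-compressed feature
    df = pd.DataFrame({"mol_weight": [180.2, 46.1], "fingerprint": ["1010", "0111"]})
    features = ["mol_weight", "fingerprint"]

    df, features = decompress_features(df, features, compressed_features=["fingerprint"])
    print(features)               # ['mol_weight', 'fin_0', 'fin_1', 'fin_2', 'fin_3']
    print(df.columns.tolist())    # original 'fingerprint' column replaced by fin_* bit columns
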
@@ -95,53 +182,68 @@ if __name__ == "__main__":
      )
      args = parser.parse_args()

-     # Load training data from the specified directory
+     # Read the training data into DataFrames
      training_files = [
          os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
+         for file in os.listdir(args.train)
+         if file.endswith(".csv")
      ]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")

-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)

-     # Training data split logic
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
      if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
      else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(
+             all_df, test_size=validation_split, random_state=42
+         )
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")

      # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
      xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()
+     ngb_model = NGBRegressor()  # Dist=Cauchy) Seems to give HUGE prediction intervals

      # Prepare features and targets for training
      X_train = df_train[features]
-     X_val = df_val[features]
+     X_validate = df_val[features]
      y_train = df_train[target]
-     y_val = df_val[target]
+     y_validate = df_val[target]

      # Train both models using the training data
      xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

      # Make Predictions on the Validation Set
      print(f"Making Predictions on Validation Set...")
-     y_validate = df_val[target]
-     X_validate = df_val[features]
      preds = xgb_model.predict(X_validate)

      # Calculate various model performance metrics (regression)
@@ -159,9 +261,9 @@ if __name__ == "__main__":
      # Save the trained NGBoost model
      joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-     # Save the feature list to validate input during predictions
+     # Save the features (this will validate input during predictions)
      with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
+         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

      # Now the Proximity model
      model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -255,16 +357,27 @@ def predict_fn(df, models) -> pd.DataFrame:
      df["prediction_std"] = dist_params['scale']  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
-     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+     # so we need to adjust the bounds to include the point prediction
+     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)   # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)   # 75th percentile

-     # Adjust prediction intervals to include point predictions
-     df["q_025"] = df[["q_025", "prediction"]].min(axis=1)
-     df["q_975"] = df[["q_975", "prediction"]].max(axis=1)
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Compute Nearest neighbors with Proximity model
      models["proximity"].neighbors(df)
@@ -219,9 +219,22 @@ def predict_fn(df, model) -> pd.DataFrame:
      df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
      df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)   # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)   # 75th percentile
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Return the modified DataFrame
      return df
@@ -1,3 +1 @@
- # Note: NGBoost is not included in the default inference image, so it must be specified here.
- ngboost
- mapie
+ # Note: Most libs are already in the training/inference images, ONLY specify additional libs here
@@ -70,6 +70,11 @@ def fill_template(template_path: str, params: dict, output_script: str) -> str:
      # Sanity check to ensure all placeholders were replaced
      if "{{" in template and "}}" in template:
          msg = "Not all template placeholders were replaced. Please check your params."
+
+         # Show which placeholders are still present
+         start = template.index("{{")
+         end = template.index("}}", start) + 2
+         msg += f" Unreplaced placeholder: {template[start:end]}"
          log.critical(msg)
          raise ValueError(msg)

@@ -28,12 +28,12 @@ from typing import List, Tuple

  # Template Parameters
  TEMPLATE_PARAMS = {
- "model_type": "classifier",
- "target_column": "class",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'pred_pka_reg'],
+ "model_type": "regressor",
+ "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-with-pka-class-100-test/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/temp-hlm-phase1-reg-0-80/training",
+ "train_all_data": False
  }

  # Function to check if dataframe is empty
@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      """
      Matches and renames DataFrame columns to match model feature names (case-insensitive).
      Prioritizes exact matches, then case-insensitive matches.
-
+
      Raises ValueError if any model features cannot be matched.
      """
      df_columns_lower = {col.lower(): col for col in df.columns}
      rename_dict = {}
      missing = []
-
      for feature in model_features:
          if feature in df.columns:
              continue  # Exact match
@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
              rename_dict[df_columns_lower[feature.lower()]] = feature
          else:
              missing.append(feature)
-
+
      if missing:
          raise ValueError(f"Features not found: {missing}")
-
+
+     # Rename the DataFrame columns to match the model features
      return df.rename(columns=rename_dict)


@@ -197,7 +197,7 @@ if __name__ == "__main__":
      """The main function is for training the XGBoost model"""

      # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
+     target = TEMPLATE_PARAMS["target"]
      features = TEMPLATE_PARAMS["features"]
      orig_features = features.copy()
      compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
      """Parse input data and return a DataFrame."""
      if not input_data:
          raise ValueError("Empty input data is not supported!")
-
+
      # Decode bytes to string if necessary
      if isinstance(input_data, bytes):
          input_data = input_data.decode("utf-8")
@@ -29,7 +29,7 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
      "model_type": "{{model_type}}",
-     "target_column": "{{target_column}}",
+     "target": "{{target_column}}",
      "features": "{{feature_list}}",
      "compressed_features": "{{compressed_features}}",
      "model_metrics_s3_path": "{{model_metrics_s3_path}}",
@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      """
      Matches and renames DataFrame columns to match model feature names (case-insensitive).
      Prioritizes exact matches, then case-insensitive matches.
-
+
      Raises ValueError if any model features cannot be matched.
      """
      df_columns_lower = {col.lower(): col for col in df.columns}
      rename_dict = {}
      missing = []
-
      for feature in model_features:
          if feature in df.columns:
              continue  # Exact match
@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
              rename_dict[df_columns_lower[feature.lower()]] = feature
          else:
              missing.append(feature)
-
+
      if missing:
          raise ValueError(f"Features not found: {missing}")
-
+
+     # Rename the DataFrame columns to match the model features
      return df.rename(columns=rename_dict)


@@ -197,7 +197,7 @@ if __name__ == "__main__":
      """The main function is for training the XGBoost model"""

      # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
+     target = TEMPLATE_PARAMS["target"]
      features = TEMPLATE_PARAMS["features"]
      orig_features = features.copy()
      compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
      """Parse input data and return a DataFrame."""
      if not input_data:
          raise ValueError("Empty input data is not supported!")
-
+
      # Decode bytes to string if necessary
      if isinstance(input_data, bytes):
          input_data = input_data.decode("utf-8")
@@ -76,7 +76,7 @@ def run_batch_job(script_path: str, size: str = "small") -> int:
      response = batch.submit_job(
          jobName=job_name,
          jobQueue="workbench-job-queue",
-         jobDefinition=f"workbench-ml-pipeline-{size}",
+         jobDefinition=f"workbench-batch-{size}",
          containerOverrides={
              "environment": [
                  {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -0,0 +1,139 @@
+ import argparse
+ import logging
+ import json
+ from pathlib import Path
+
+ # Workbench Imports
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+ from workbench.utils.config_manager import ConfigManager
+ from workbench.utils.s3_utils import upload_content_to_s3
+
+ log = logging.getLogger("workbench")
+ cm = ConfigManager()
+ workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+ def submit_to_sqs(script_path: str, size: str = "small") -> None:
+     """
+     Upload script to S3 and submit message to SQS queue for processing.
+     Args:
+         script_path: Local path to the ML pipeline script
+         size: Job size tier - "small" (default), "medium", or "large"
+     """
+     print(f"\n{'=' * 60}")
+     print("🚀 SUBMITTING ML PIPELINE JOB")
+     print(f"{'=' * 60}")
+
+     if size not in ["small", "medium", "large"]:
+         raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+     # Validate script exists
+     script_file = Path(script_path)
+     if not script_file.exists():
+         raise FileNotFoundError(f"Script not found: {script_path}")
+
+     print(f"📄 Script: {script_file.name}")
+     print(f"📏 Size tier: {size}")
+     print(f"🪣 Bucket: {workbench_bucket}")
+     sqs = AWSAccountClamp().boto3_session.client("sqs")
+     script_name = script_file.name
+
+     # List Workbench queues
+     print("\n📋 Listing Workbench SQS queues...")
+     try:
+         queues = sqs.list_queues(QueueNamePrefix="workbench-")
+         queue_urls = queues.get("QueueUrls", [])
+         if queue_urls:
+             print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+             for url in queue_urls:
+                 queue_name = url.split("/")[-1]
+                 print(f"   • {queue_name}")
+         else:
+             print("⚠️ No workbench queues found")
+     except Exception as e:
+         print(f"❌ Error listing queues: {e}")
+
+     # Upload script to S3
+     s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+     print("\n📤 Uploading script to S3...")
+     print(f"   Source: {script_path}")
+     print(f"   Destination: {s3_path}")
+
+     try:
+         upload_content_to_s3(script_file.read_text(), s3_path)
+         print("✅ Script uploaded successfully")
+     except Exception as e:
+         print(f"❌ Upload failed: {e}")
+         raise
+     # Get queue URL and info
+     queue_name = "workbench-ml-pipeline-queue.fifo"
+     print("\n🎯 Getting queue information...")
+     print(f"   Queue name: {queue_name}")
+
+     try:
+         queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+         print(f"   Queue URL: {queue_url}")
+
+         # Get queue attributes for additional info
+         attrs = sqs.get_queue_attributes(
+             QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+         )
+         messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+         messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+         print(f"   Messages in queue: {messages_available}")
+         print(f"   Messages in flight: {messages_in_flight}")
+
+     except Exception as e:
+         print(f"❌ Error accessing queue: {e}")
+         raise
+
+     # Prepare message
+     message = {"script_path": s3_path, "size": size}
+     print("\n📨 Sending message to SQS...")
+
+     # Send the message to SQS
+     try:
+         response = sqs.send_message(
+             QueueUrl=queue_url,
+             MessageBody=json.dumps(message, indent=2),
+             MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+         )
+         message_id = response["MessageId"]
+         print("✅ Message sent successfully!")
+         print(f"   Message ID: {message_id}")
+     except Exception as e:
+         print(f"❌ Failed to send message: {e}")
+         raise
+
+     # Success summary
+     print(f"\n{'=' * 60}")
+     print("✅ JOB SUBMISSION COMPLETE")
+     print(f"{'=' * 60}")
+     print(f"📄 Script: {script_name}")
+     print(f"📏 Size: {size}")
+     print(f"🆔 Message ID: {message_id}")
+     print("\n🔍 MONITORING LOCATIONS:")
+     print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+     print("   • Lambda Logs: AWS Console → Lambda → Functions")
+     print("   • Batch Jobs: AWS Console → Batch → Jobs")
+     print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+     print("\n⏳ Your job should start processing soon...")
+
+
+ def main():
+     """CLI entry point for submitting ML pipelines via SQS."""
+     parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+     parser.add_argument("script_file", help="Local path to ML pipeline script")
+     parser.add_argument(
+         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+     )
+     args = parser.parse_args()
+     try:
+         submit_to_sqs(args.script_file, args.size)
+     except Exception as e:
+         print(f"\n❌ ERROR: {e}")
+         log.error(f"Error: {e}")
+         exit(1)
+
+
+ if __name__ == "__main__":
+     main()
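
Editor's note: the new workbench/scripts/ml_pipeline_sqs.py above exposes submit_to_sqs(). A hedged usage sketch (assumes workbench 0.8.172 is installed, AWS credentials are configured, and the workbench-ml-pipeline-queue.fifo queue exists in the account; the pipeline filename is hypothetical):

    from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

    # Uploads the script to s3://<WORKBENCH_BUCKET>/batch-jobs/ and queues a FIFO message
    submit_to_sqs("my_pipeline.py", size="medium")
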
@@ -140,7 +140,7 @@ def uq_model(model: "Model", uq_model_name: str, train_all_data: bool = False) -
      from workbench.api import Model, ModelType, FeatureSet  # noqa: F401 (avoid circular import)

      # Get the custom script path for the UQ model
-     script_path = get_custom_script_path("uq_models", "meta_uq.template")
+     script_path = get_custom_script_path("uq_models", "mapie.template")

      # Get Feature and Target Columns from the existing given Model
      features = model.features()
@@ -220,6 +220,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
      # --- Coverage and Interval Width ---
      if "q_025" in df.columns and "q_975" in df.columns:
          lower_95, upper_95 = df["q_025"], df["q_975"]
+         lower_90, upper_90 = df["q_05"], df["q_95"]
+         lower_80, upper_80 = df["q_10"], df["q_90"]
          lower_50, upper_50 = df["q_25"], df["q_75"]
      elif "prediction_std" in df.columns:
          lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -231,8 +233,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
          "Either quantile columns (q_025, q_975, q_25, q_75) or 'prediction_std' column must be present."
      )
      coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
+     coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
+     coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
      coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
      avg_width_95 = np.mean(upper_95 - lower_95)
+     avg_width_90 = np.mean(upper_90 - lower_90)
+     avg_width_80 = np.mean(upper_80 - lower_80)
      avg_width_50 = np.mean(upper_50 - lower_50)

      # --- CRPS (measures calibration + sharpness) ---
@@ -260,6 +266,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
      # Collect results
      results = {
          "coverage_95": coverage_95,
+         "coverage_90": coverage_90,
+         "coverage_80": coverage_80,
          "coverage_50": coverage_50,
          "avg_width_95": avg_width_95,
          "avg_width_50": avg_width_50,
@@ -271,8 +279,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

      print("\n=== UQ Metrics ===")
      print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
+     print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
+     print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
      print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
      print(f"Average 95% Width: {avg_width_95:.3f}")
+     print(f"Average 90% Width: {avg_width_90:.3f}")
+     print(f"Average 80% Width: {avg_width_80:.3f}")
      print(f"Average 50% Width: {avg_width_50:.3f}")
      print(f"CRPS: {mean_crps:.3f} (lower is better)")
      print(f"Interval Score 95%: {mean_is_95:.3f} (lower is better)")