workbench 0.8.170__py3-none-any.whl → 0.8.171__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -196,7 +196,9 @@ class AWSMeta:
 
         # Return the summary as a DataFrame
         df = pd.DataFrame(data_summary).convert_dtypes()
-        return df.sort_values(by="Created", ascending=False)
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df
 
     def models(self, details: bool = False) -> pd.DataFrame:
         """Get a summary of the Models in AWS.
@@ -256,7 +258,9 @@ class AWSMeta:
 
         # Return the summary as a DataFrame
         df = pd.DataFrame(model_summary).convert_dtypes()
-        return df.sort_values(by="Created", ascending=False)
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df
 
     def endpoints(self, details: bool = False) -> pd.DataFrame:
         """Get a summary of the Endpoints in AWS.
@@ -317,7 +321,9 @@ class AWSMeta:
 
         # Return the summary as a DataFrame
         df = pd.DataFrame(data_summary).convert_dtypes()
-        return df.sort_values(by="Created", ascending=False)
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df
 
     def _endpoint_config_info(self, endpoint_config_name: str) -> dict:
         """Internal: Get the Endpoint Configuration information for the given endpoint config name.
@@ -657,7 +663,8 @@ class AWSMeta:
         df = pd.DataFrame(data_summary).convert_dtypes()
 
         # Sort by the Modified column
-        df = df.sort_values(by="Modified", ascending=False)
+        if not df.empty:
+            df = df.sort_values(by="Modified", ascending=False)
         return df
 
     def _aws_pipelines(self) -> pd.DataFrame:
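For context, a minimal sketch (not part of the package diff) of why the new emptiness guard matters: when a listing returns no rows, the summary DataFrame has no columns at all, so sorting by "Created" or "Modified" raises a KeyError, while the guarded version simply returns the empty frame.

import pandas as pd

df = pd.DataFrame([]).convert_dtypes()        # empty summary: no rows, therefore no columns
# df.sort_values(by="Created")                # would raise KeyError: 'Created'
if not df.empty:
    df.sort_values(by="Created", ascending=False, inplace=True)
print(df)                                     # prints an empty DataFrame instead of crashing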
@@ -2,7 +2,6 @@
 from ngboost import NGBRegressor
 from xgboost import XGBRegressor  # Base Estimator
 from sklearn.model_selection import train_test_split
-import numpy as np
 
 # Model Performance Scores
 from sklearn.metrics import (
@@ -16,7 +15,9 @@ import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
+from typing import List, Tuple
 
 # Local Imports
 from proximity import Proximity
@@ -25,11 +26,12 @@ from proximity import Proximity
 
 # Template Placeholders
 TEMPLATE_PARAMS = {
- "id_column": "id",
- "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
- "target": "solubility",
- "train_all_data": True,
- "track_columns": ['solubility']
+ "id_column": "udm_mol_bat_id",
+ "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+ "compressed_features": [],
+ "train_all_data": False,
+ "track_columns": ['udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein']
 }
 
 
@@ -73,136 +75,97 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)
 
 
-def distance_weighted_calibrated_intervals(
-    df_pred: pd.DataFrame,
-    prox_df: pd.DataFrame,
-    calibration_strength: float = 0.7,
-    distance_decay: float = 3.0,
-) -> pd.DataFrame:
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
     """
-    Calibrate intervals using distance-weighted neighbor quantiles.
-    Uses all 10 neighbors with distance-based weighting.
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
     """
-    id_column = TEMPLATE_PARAMS["id_column"]
-    target_column = TEMPLATE_PARAMS["target"]
-
-    # Distance-weighted neighbor statistics
-    def weighted_quantile(values, weights, q):
-        """Calculate weighted quantile"""
-        if len(values) == 0:
-            return np.nan
-        sorted_indices = np.argsort(values)
-        sorted_values = values[sorted_indices]
-        sorted_weights = weights[sorted_indices]
-        cumsum = np.cumsum(sorted_weights)
-        cutoff = q * cumsum[-1]
-        return np.interp(cutoff, cumsum, sorted_values)
-
-    # Calculate distance weights (closer neighbors get more weight)
-    prox_df = prox_df.copy()
-    prox_df['weight'] = 1 / (1 + prox_df['distance'] ** distance_decay)
-
-    # Get weighted quantiles and statistics for each ID
-    neighbor_stats = []
-    for id_val, group in prox_df.groupby(id_column):
-        values = group[target_column].values
-        weights = group['weight'].values
-
-        # Normalize weights
-        weights = weights / weights.sum()
-
-        stats = {
-            id_column: id_val,
-            'local_q025': weighted_quantile(values, weights, 0.025),
-            'local_q25': weighted_quantile(values, weights, 0.25),
-            'local_q75': weighted_quantile(values, weights, 0.75),
-            'local_q975': weighted_quantile(values, weights, 0.975),
-            'local_median': weighted_quantile(values, weights, 0.5),
-            'local_std': np.sqrt(np.average((values - np.average(values, weights=weights)) ** 2, weights=weights)),
-            'avg_distance': group['distance'].mean(),
-            'min_distance': group['distance'].min(),
-            'max_distance': group['distance'].max(),
-        }
-        neighbor_stats.append(stats)
-
-    neighbor_df = pd.DataFrame(neighbor_stats)
-    out = df_pred.merge(neighbor_df, on=id_column, how='left')
-
-    # Model disagreement score (normalized by prediction std)
-    model_disagreement = (out["prediction"] - out["prediction_uq"]).abs()
-    disagreement_score = (model_disagreement / out["prediction_std"]).clip(0, 2)
-
-    # Local confidence based on:
-    # 1. How close the neighbors are (closer = more confident)
-    # 2. How much local variance there is (less variance = more confident)
-    max_reasonable_distance = out['max_distance'].quantile(0.8)  # 80th percentile as reference
-    distance_confidence = (1 - (out['avg_distance'] / max_reasonable_distance)).clip(0.1, 1.0)
-
-    variance_confidence = (out["prediction_std"] / out["local_std"]).clip(0.5, 2.0)
-    local_confidence = distance_confidence * variance_confidence.clip(0.5, 1.5)
-
-    # Calibration weight: higher when models disagree and we have good local data
-    calibration_weight = (
-        calibration_strength *
-        local_confidence *  # Weight by local data quality
-        disagreement_score.clip(0.3, 1.0)  # More calibration when models disagree
-    )
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
 
-    # Consensus prediction (slight preference for NGBoost since it provides intervals)
-    consensus_pred = 0.65 * out["prediction_uq"] + 0.35 * out["prediction"]
+    return df, category_mappings
 
-    # Re-center local intervals around consensus prediction
-    local_center_offset = consensus_pred - out["local_median"]
 
-    # Apply calibration to each quantile
-    quantile_pairs = [
-        ("q_025", "local_q025"),
-        ("q_25", "local_q25"),
-        ("q_75", "local_q75"),
-        ("q_975", "local_q975")
-    ]
+def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the XGBoost model
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
 
-    for model_q, local_q in quantile_pairs:
-        # Adjust local quantiles to be centered around consensus
-        adjusted_local_q = out[local_q] + local_center_offset
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
 
-        # Blend model and local intervals
-        out[model_q] = (
-            (1 - calibration_weight) * out[model_q] +
-            calibration_weight * adjusted_local_q
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
         )
 
-    # Ensure proper interval ordering and bounds using pandas
-    out["q_025"] = pd.concat([out["q_025"], consensus_pred], axis=1).min(axis=1)
-    out["q_975"] = pd.concat([out["q_975"], consensus_pred], axis=1).max(axis=1)
-    out["q_25"] = pd.concat([out["q_25"], out["q_75"]], axis=1).min(axis=1)
+    # Decompress the specified compressed features
+    decompressed_features = features
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
 
-    # Optional: Add some interval expansion when neighbors are very far
-    # (indicates we're in a sparse region of feature space)
-    sparse_region_mask = out['min_distance'] > out['min_distance'].quantile(0.9)
-    expansion_factor = 1 + 0.2 * sparse_region_mask  # 20% expansion in sparse regions
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
 
-    for q in ["q_025", "q_25", "q_75", "q_975"]:
-        interval_width = out[q] - consensus_pred
-        out[q] = consensus_pred + interval_width * expansion_factor
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
 
-    # Clean up temporary columns
-    cleanup_cols = [col for col in out.columns if col.startswith("local_")] + \
-        ['avg_distance', 'min_distance', 'max_distance']
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
 
-    return out.drop(columns=cleanup_cols)
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
 
 
-# TRAINING SECTION
-#
-# This section (__main__) is where SageMaker will execute the training job
-# and save the model artifacts to the model directory.
-#
 if __name__ == "__main__":
     # Template Parameters
     id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2
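As a rough illustration (toy data, not from the package) of how the convert_categorical_types helper added above is meant to round-trip between training and inference: the mappings captured at training time are reapplied at inference so the category encoding stays consistent, and unseen values fall out as NaN.

import pandas as pd

train = pd.DataFrame({"solvent": ["dmso", "water", "dmso"], "molwt": [180.2, 42.1, 250.3]})
infer = pd.DataFrame({"solvent": ["water", "acetone"], "molwt": [300.0, 58.1]})

# Training mode: an empty mapping dict means "learn the categories"
train, mappings = convert_categorical_types(train, ["solvent", "molwt"])
print(mappings)                          # {'solvent': ['dmso', 'water']}

# Inference mode: reuse the learned mapping; unseen values become NaN
infer, _ = convert_categorical_types(infer, ["solvent", "molwt"], mappings)
print(infer["solvent"].cat.categories)   # Index(['dmso', 'water'], dtype='object')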
@@ -216,34 +179,51 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    # Load training data from the specified directory
+    # Read the training data into DataFrames
     training_files = [
         os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
     ]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
-    df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
-    # Check if the DataFrame is empty
-    check_dataframe(df, "training_df")
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
 
-    # Training data split logic
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")
+
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)
+
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-        # Use all data for both training and validation
-        print("Training on all data...")
-        df_train = df.copy()
-        df_val = df.copy()
-    elif "training" in df.columns:
-        # Split data based on a 'training' column if it exists
-        print("Splitting data based on 'training' column...")
-        df_train = df[df["training"]].copy()
-        df_val = df[~df["training"]].copy()
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        # Perform a random split if no 'training' column is found
-        print("Splitting data randomly...")
-        df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")
 
     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
     xgb_model = XGBRegressor()
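A small, hypothetical example (column names and values invented) of what the decompress_features helper added above does with a bitstring column: each character of the bitstring becomes its own 0/1 column, prefixed with the first three letters of the original feature name, and the feature list is updated accordingly.

import numpy as np
import pandas as pd

df = pd.DataFrame({"fingerprint": ["1010", "0110"], "molwt": [180.2, 42.1]})
features = ["fingerprint", "molwt"]

df, features = decompress_features(df, features, compressed_features=["fingerprint"])
print(features)               # ['molwt', 'fin_0', 'fin_1', 'fin_2', 'fin_3']
print(df.columns.tolist())    # ['molwt', 'fin_0', 'fin_1', 'fin_2', 'fin_3']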
@@ -251,18 +231,16 @@ if __name__ == "__main__":
 
     # Prepare features and targets for training
     X_train = df_train[features]
-    X_val = df_val[features]
+    X_validate = df_val[features]
     y_train = df_train[target]
-    y_val = df_val[target]
+    y_validate = df_val[target]
 
     # Train both models using the training data
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
 
     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
     preds = xgb_model.predict(X_validate)
 
     # Calculate various model performance metrics (regression)
@@ -280,9 +258,9 @@ if __name__ == "__main__":
     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
 
-    # Save the feature list to validate input during predictions
+    # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(features, fp)
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
 
     # Now the Proximity model
     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -295,7 +273,7 @@ if __name__ == "__main__":
 # Inference Section
 #
 def model_fn(model_dir) -> dict:
-    """Load and return XGBoost and NGBoost regressors from model directory."""
+    """Load and return XGBoost, NGBoost, and Prox Model from model directory."""
 
     # Load XGBoost regressor
     xgb_path = os.path.join(model_dir, "xgb_model.json")
@@ -376,18 +354,30 @@ def predict_fn(df, models) -> pd.DataFrame:
     df["prediction_std"] = dist_params['scale']  # standard deviation
 
     # Add 95% prediction intervals using ppf (percent point function)
-    df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-    df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+    # so we need to adjust the bounds to include the point prediction
+    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
 
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
 
-    # Compute Nearest neighbors with Proximity model
-    prox_df = models["proximity"].neighbors(df)
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]
 
-    # Shrink prediction intervals based on KNN variance
-    df = distance_weighted_calibrated_intervals(df, prox_df)
+    # Compute Nearest neighbors with Proximity model
+    models["proximity"].neighbors(df)
 
     # Return the modified DataFrame
     return df
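A tiny numeric sketch (values invented) of the bound adjustment introduced above: the point prediction comes from XGBoost while the quantiles come from the NGBoost distribution, so each 95% interval is widened just enough to contain the point prediction.

import numpy as np

xgb_pred = np.array([2.0, 5.0])   # XGBoost point predictions
ngb_q025 = np.array([1.5, 5.2])   # NGBoost 2.5% quantiles
ngb_q975 = np.array([3.0, 6.0])   # NGBoost 97.5% quantiles

q_025 = np.minimum(ngb_q025, xgb_pred)  # [1.5, 5.0]  second lower bound pulled down to cover 5.0
q_975 = np.maximum(ngb_q975, xgb_pred)  # [3.0, 6.0]  upper bounds already cover the predictions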
@@ -15,7 +15,9 @@ import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
+from typing import List, Tuple
 
 # Local Imports
 from proximity import Proximity
@@ -25,8 +27,9 @@ from proximity import Proximity
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
-    "features": "{{feature_list}}",
     "target": "{{target_column}}",
+    "features": "{{feature_list}}",
+    "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
     "track_columns": "{{track_columns}}"
 }
@@ -72,16 +75,97 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)
 
 
-# TRAINING SECTION
-#
-# This section (__main__) is where SageMaker will execute the training job
-# and save the model artifacts to the model directory.
-#
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+    """
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the XGBoost model
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
+
+
 if __name__ == "__main__":
     # Template Parameters
     id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2
@@ -95,34 +179,51 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    # Load training data from the specified directory
+    # Read the training data into DataFrames
    training_files = [
         os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
     ]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
-    df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")
 
-    # Check if the DataFrame is empty
-    check_dataframe(df, "training_df")
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)
 
-    # Training data split logic
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-        # Use all data for both training and validation
-        print("Training on all data...")
-        df_train = df.copy()
-        df_val = df.copy()
-    elif "training" in df.columns:
-        # Split data based on a 'training' column if it exists
-        print("Splitting data based on 'training' column...")
-        df_train = df[df["training"]].copy()
-        df_val = df[~df["training"]].copy()
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        # Perform a random split if no 'training' column is found
-        print("Splitting data randomly...")
-        df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")
 
     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
     xgb_model = XGBRegressor()
@@ -130,18 +231,16 @@ if __name__ == "__main__":
 
     # Prepare features and targets for training
     X_train = df_train[features]
-    X_val = df_val[features]
+    X_validate = df_val[features]
     y_train = df_train[target]
-    y_val = df_val[target]
+    y_validate = df_val[target]
 
     # Train both models using the training data
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
 
     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
     preds = xgb_model.predict(X_validate)
 
     # Calculate various model performance metrics (regression)
@@ -159,9 +258,9 @@ if __name__ == "__main__":
     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
 
-    # Save the feature list to validate input during predictions
+    # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(features, fp)
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
 
     # Now the Proximity model
     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -255,16 +354,27 @@ def predict_fn(df, models) -> pd.DataFrame:
     df["prediction_std"] = dist_params['scale']  # standard deviation
 
     # Add 95% prediction intervals using ppf (percent point function)
-    df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-    df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+    # so we need to adjust the bounds to include the point prediction
+    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
 
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
 
-    # Adjust prediction intervals to include point predictions
-    df["q_025"] = df[["q_025", "prediction"]].min(axis=1)
-    df["q_975"] = df[["q_975", "prediction"]].max(axis=1)
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]
 
     # Compute Nearest neighbors with Proximity model
     models["proximity"].neighbors(df)
@@ -219,9 +219,22 @@ def predict_fn(df, model) -> pd.DataFrame:
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
 
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]
 
     # Return the modified DataFrame
     return df
@@ -28,12 +28,12 @@ from typing import List, Tuple
 
 # Template Parameters
 TEMPLATE_PARAMS = {
- "model_type": "classifier",
- "target_column": "class",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'pred_pka_reg'],
+ "model_type": "regressor",
+ "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-with-pka-class-100-test/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/temp-hlm-phase1-reg-0-80/training",
+ "train_all_data": False
 }
 
 # Function to check if dataframe is empty
@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
-
+
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -197,7 +197,7 @@ if __name__ == "__main__":
     """The main function is for training the XGBoost model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
@@ -29,7 +29,7 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "target_column": "{{target_column}}",
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
-
+
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -197,7 +197,7 @@ if __name__ == "__main__":
     """The main function is for training the XGBoost model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
@@ -76,7 +76,7 @@ def run_batch_job(script_path: str, size: str = "small") -> int:
     response = batch.submit_job(
         jobName=job_name,
         jobQueue="workbench-job-queue",
-        jobDefinition=f"workbench-ml-pipeline-{size}",
+        jobDefinition=f"workbench-batch-{size}",
         containerOverrides={
             "environment": [
                 {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -0,0 +1,139 @@
+import argparse
+import logging
+import json
+from pathlib import Path
+
+# Workbench Imports
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+from workbench.utils.config_manager import ConfigManager
+from workbench.utils.s3_utils import upload_content_to_s3
+
+log = logging.getLogger("workbench")
+cm = ConfigManager()
+workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+def submit_to_sqs(script_path: str, size: str = "small") -> None:
+    """
+    Upload script to S3 and submit message to SQS queue for processing.
+    Args:
+        script_path: Local path to the ML pipeline script
+        size: Job size tier - "small" (default), "medium", or "large"
+    """
+    print(f"\n{'=' * 60}")
+    print("🚀 SUBMITTING ML PIPELINE JOB")
+    print(f"{'=' * 60}")
+
+    if size not in ["small", "medium", "large"]:
+        raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+    # Validate script exists
+    script_file = Path(script_path)
+    if not script_file.exists():
+        raise FileNotFoundError(f"Script not found: {script_path}")
+
+    print(f"📄 Script: {script_file.name}")
+    print(f"📏 Size tier: {size}")
+    print(f"🪣 Bucket: {workbench_bucket}")
+    sqs = AWSAccountClamp().boto3_session.client("sqs")
+    script_name = script_file.name
+
+    # List Workbench queues
+    print("\n📋 Listing Workbench SQS queues...")
+    try:
+        queues = sqs.list_queues(QueueNamePrefix="workbench-")
+        queue_urls = queues.get("QueueUrls", [])
+        if queue_urls:
+            print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+            for url in queue_urls:
+                queue_name = url.split("/")[-1]
+                print(f"   • {queue_name}")
+        else:
+            print("⚠️ No workbench queues found")
+    except Exception as e:
+        print(f"❌ Error listing queues: {e}")
+
+    # Upload script to S3
+    s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+    print("\n📤 Uploading script to S3...")
+    print(f"   Source: {script_path}")
+    print(f"   Destination: {s3_path}")
+
+    try:
+        upload_content_to_s3(script_file.read_text(), s3_path)
+        print("✅ Script uploaded successfully")
+    except Exception as e:
+        print(f"❌ Upload failed: {e}")
+        raise
+    # Get queue URL and info
+    queue_name = "workbench-ml-pipeline-queue.fifo"
+    print("\n🎯 Getting queue information...")
+    print(f"   Queue name: {queue_name}")
+
+    try:
+        queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+        print(f"   Queue URL: {queue_url}")
+
+        # Get queue attributes for additional info
+        attrs = sqs.get_queue_attributes(
+            QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+        )
+        messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+        messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+        print(f"   Messages in queue: {messages_available}")
+        print(f"   Messages in flight: {messages_in_flight}")
+
+    except Exception as e:
+        print(f"❌ Error accessing queue: {e}")
+        raise
+
+    # Prepare message
+    message = {"script_path": s3_path, "size": size}
+    print("\n📨 Sending message to SQS...")
+
+    # Send the message to SQS
+    try:
+        response = sqs.send_message(
+            QueueUrl=queue_url,
+            MessageBody=json.dumps(message, indent=2),
+            MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+        )
+        message_id = response["MessageId"]
+        print("✅ Message sent successfully!")
+        print(f"   Message ID: {message_id}")
+    except Exception as e:
+        print(f"❌ Failed to send message: {e}")
+        raise
+
+    # Success summary
+    print(f"\n{'=' * 60}")
+    print("✅ JOB SUBMISSION COMPLETE")
+    print(f"{'=' * 60}")
+    print(f"📄 Script: {script_name}")
+    print(f"📏 Size: {size}")
+    print(f"🆔 Message ID: {message_id}")
+    print("\n🔍 MONITORING LOCATIONS:")
+    print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+    print("   • Lambda Logs: AWS Console → Lambda → Functions")
+    print("   • Batch Jobs: AWS Console → Batch → Jobs")
+    print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+    print("\n⏳ Your job should start processing soon...")
+
+
+def main():
+    """CLI entry point for submitting ML pipelines via SQS."""
+    parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+    parser.add_argument("script_file", help="Local path to ML pipeline script")
+    parser.add_argument(
+        "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+    )
+    args = parser.parse_args()
+    try:
+        submit_to_sqs(args.script_file, args.size)
+    except Exception as e:
+        print(f"\n❌ ERROR: {e}")
+        log.error(f"Error: {e}")
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
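Based on the argparse setup above, the new script would be invoked roughly as `python ml_pipeline_sqs.py my_pipeline.py --size medium` (the pipeline script name is a placeholder). The same thing can be done programmatically, assuming the module is importable as workbench.scripts.ml_pipeline_sqs per the RECORD entry further below:

from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

# Equivalent to: python ml_pipeline_sqs.py my_pipeline.py --size medium
submit_to_sqs("my_pipeline.py", size="medium")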
@@ -220,6 +220,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     # --- Coverage and Interval Width ---
     if "q_025" in df.columns and "q_975" in df.columns:
         lower_95, upper_95 = df["q_025"], df["q_975"]
+        lower_90, upper_90 = df["q_05"], df["q_95"]
+        lower_80, upper_80 = df["q_10"], df["q_90"]
         lower_50, upper_50 = df["q_25"], df["q_75"]
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -231,8 +233,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
             "Either quantile columns (q_025, q_975, q_25, q_75) or 'prediction_std' column must be present."
         )
     coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
+    coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
+    coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
     coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
     avg_width_95 = np.mean(upper_95 - lower_95)
+    avg_width_90 = np.mean(upper_90 - lower_90)
+    avg_width_80 = np.mean(upper_80 - lower_80)
     avg_width_50 = np.mean(upper_50 - lower_50)
 
     # --- CRPS (measures calibration + sharpness) ---
@@ -260,6 +266,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     # Collect results
     results = {
         "coverage_95": coverage_95,
+        "coverage_90": coverage_90,
+        "coverage_80": coverage_80,
         "coverage_50": coverage_50,
         "avg_width_95": avg_width_95,
         "avg_width_50": avg_width_50,
@@ -271,8 +279,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
 
     print("\n=== UQ Metrics ===")
     print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
+    print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
+    print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
     print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
     print(f"Average 95% Width: {avg_width_95:.3f}")
+    print(f"Average 90% Width: {avg_width_90:.3f}")
+    print(f"Average 80% Width: {avg_width_80:.3f}")
     print(f"Average 50% Width: {avg_width_50:.3f}")
     print(f"CRPS: {mean_crps:.3f} (lower is better)")
     print(f"Interval Score 95%: {mean_is_95:.3f} (lower is better)")
@@ -72,7 +72,9 @@ class DashboardStatus(PluginInterface):
             details = "**Redis:** 🔴 Failed to Connect<br>"
 
         # Fill in the license details
-        details += f"**Redis Server:** {config_info['REDIS_HOST']}:{config_info.get('REDIS_PORT', 6379)}<br>"
+        redis_host = config_info.get("REDIS_HOST", "NOT SET")
+        redis_port = config_info.get("REDIS_PORT", "NOT SET")
+        details += f"**Redis Server:** {redis_host}:{redis_port}<br>"
         details += f"**Workbench S3 Bucket:** {config_info['WORKBENCH_BUCKET']}<br>"
         details += f"**Plugin Path:** {config_info.get('WORKBENCH_PLUGINS', 'unknown')}<br>"
         details += f"**Themes Path:** {config_info.get('WORKBENCH_THEMES', 'unknown')}<br>"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.170
+Version: 0.8.171
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT
@@ -62,7 +62,7 @@ workbench/core/cloud_platform/aws/README.md,sha256=QT5IQXoUHbIA0qQ2wO6_2P2lYjYQF
 workbench/core/cloud_platform/aws/aws_account_clamp.py,sha256=OzFknZXKW7VTvnDGGX4BXKoh0i1gQ7yaEBhkLCyHFSs,6310
 workbench/core/cloud_platform/aws/aws_df_store.py,sha256=utRIlTCPwFneHHZ8_Z3Hw3rOJSeryiFA4wBtucxULRQ,15055
 workbench/core/cloud_platform/aws/aws_graph_store.py,sha256=ytYxQTplUmeWbsPmxyZbf6mO9qyTl60ewlJG8MyfyEY,9414
-workbench/core/cloud_platform/aws/aws_meta.py,sha256=xpidYpDydgWmKmJPrNFWbggahDY-nRXzXTRaEA3c5Sc,34587
+workbench/core/cloud_platform/aws/aws_meta.py,sha256=eY9Pn6pl2yAyseACFb2nitR-0vLwG4i8CSEXe8Iaswc,34778
 workbench/core/cloud_platform/aws/aws_parameter_store.py,sha256=9ekuMOQFHFMIEV68UbHhS_fLB9iqG5Hvu4EV6iamEpk,10400
 workbench/core/cloud_platform/aws/aws_secrets_manager.py,sha256=TUnddp1gX-OwxJ_oO5ONh7OI4Z2HC_6euGkJ-himCCk,8615
 workbench/core/cloud_platform/aws/aws_session.py,sha256=2Gc_k4Q87BBeQDgXgVR-w-qmsF6ncZR8wvTeNnixM6k,6926
@@ -139,10 +139,10 @@ workbench/model_scripts/custom_models/uq_models/Readme.md,sha256=UVpL-lvtTrLqwBe
 workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=U4LIlpp8Rbu3apyzPR7-55lvlutpTsCro_PUvQ5pklY,6457
 workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=0IJnSBACQ556ldEiPqR7yPCOOLJs1hQhHmPBvB2d9tY,13491
 workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256=QbDUfkiPCwJ-c-4Twgu4utZuYZaAyeW_3T1IP-_tutw,6683
-workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=l74VibzFnhmPeNUEiFwIIg5aNujcCs9LtRywUvc5Avo,14528
+workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=QsMivNf77m4XfrV9aYTB7K3vI-InwegD7gyLZFNQmF4,17170
 workbench/model_scripts/custom_models/uq_models/mapie_xgb.template,sha256=ZTmerwkmXtewJwx3GGJSdLRyzJV5SJ86PvCu3dV_GHw,7330
-workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=26FNangcpyV9nFOIufRuVZ45BQv6oPf9xlJZkVIULG4,9287
-workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=N-eWP967-X2Qbvk18VL7LPXRJMKne9SS2fb_jntwTec,7738
+workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=FqLLbuKMijd4DjmxuBBQN3_vZcbl8WF0BZU8HRK48_0,13977
+workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=9-O6P-SW50ul5Wl6es2DMWXSbrwOg7HWsdc8Qdln0MM,8278
 workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
 workbench/model_scripts/custom_models/uq_models/requirements.txt,sha256=jfwV5b1t6BFtdaRGrSz8LnuQzJm-4V5OlhhP-4CGxhs,107
 workbench/model_scripts/custom_script_example/custom_model_script.py,sha256=T8aydawgRVAdSlDimoWpXxG2YuWWQkbcjBVjAeSG2_0,6408
@@ -158,16 +158,17 @@ workbench/model_scripts/quant_regression/requirements.txt,sha256=jWlGc7HH7vqyukT
  workbench/model_scripts/scikit_learn/generated_model_script.py,sha256=c73ZpJBlU5k13Nx-ZDkLXu7da40CYyhwjwwmuPq6uLg,12870
  workbench/model_scripts/scikit_learn/requirements.txt,sha256=aVvwiJ3LgBUhM_PyFlb2gHXu_kpGPho3ANBzlOkfcvs,107
  workbench/model_scripts/scikit_learn/scikit_learn.template,sha256=d4pgeZYFezUQsB-7iIsjsUgB1FM6d27651wpfDdXmI0,12640
- workbench/model_scripts/xgb_model/generated_model_script.py,sha256=IITiaNcB7kqQtBCTvTbWwCb-vAKNeJsbyxBB691sU8U,21091
+ workbench/model_scripts/xgb_model/generated_model_script.py,sha256=nU9BLU0wIhK066HAgChgNLcuOM94vBqweoH8xB8wBeo,21152
  workbench/model_scripts/xgb_model/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
- workbench/model_scripts/xgb_model/xgb_model.template,sha256=RaUr8X6al5R2IILNKgGUH05Gb4H7AFFG9RE524_VH7Q,17935
+ workbench/model_scripts/xgb_model/xgb_model.template,sha256=HViJRsMWn393hP8VJRS45UQBzUVBhwR5sKc8Ern-9f4,17963
  workbench/repl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  workbench/repl/workbench_shell.py,sha256=eJ3rpYgEwZjhrVVCaJHht2N5BrimN6mbxqHXGrJmwC8,22130
  workbench/resources/open_source_api.key,sha256=3S0OTblsmC0msUPdE_dbBmI83xJNmYscuwLJ57JmuOc,433
  workbench/resources/signature_verify_pub.pem,sha256=V3-u-3_z2PH-805ybkKvzDOBwAbvHxcKn0jLBImEtzM,272
  workbench/scripts/check_double_bond_stereo.py,sha256=p5hnL54Weq77ES0HCELq9JeoM-PyUGkvVSeWYF2dKyo,7776
  workbench/scripts/glue_launcher.py,sha256=bIKQvfGxpAhzbeNvTnHfRW_5kQhY-169_868ZnCejJk,10692
- workbench/scripts/ml_pipeline_launcher.py,sha256=fjI35SXi9CDSQ6Lan7qGcLAHkVCDioyhbPlo0eDHDxQ,4913
+ workbench/scripts/ml_pipeline_batch.py,sha256=1T5JnLlUJR7bwAGBLHmLPOuj1xFRqVIQX8PsuDhHy8o,4907
+ workbench/scripts/ml_pipeline_sqs.py,sha256=7w67UUuZNYnxXiZG48gpoEFbH-c_cUfjMg0FgWI0DbQ,5100
  workbench/scripts/monitor_cloud_watch.py,sha256=s7MY4bsHts0nup9G0lWESCvgJZ9Mw1Eo-c8aKRgLjMw,9235
  workbench/scripts/redis_expire.py,sha256=DxI_RKSNlrW2BsJZXcsSbaWGBgPZdPhtzHjV9SUtElE,1120
  workbench/scripts/redis_report.py,sha256=iaJSuGPyLCs6e0TMcZDoT0YyJ43xJ1u74YD8FLnnUg4,990
@@ -219,7 +220,7 @@ workbench/utils/lambda_utils.py,sha256=7GhGRPyXn9o-toWb9HBGSnI8-DhK9YRkwhCSk_mNK
  workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYwoo7ho,6975
  workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
  workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
- workbench/utils/model_utils.py,sha256=YV_OPdRXabte9Zo8v9igs4kW8s6eCngtvapa9jY6X_k,11264
+ workbench/utils/model_utils.py,sha256=S_fGnYucuOH5YfNviH-K85hUjSh1zFRCIjuduax7rvU,11940
  workbench/utils/monitor_utils.py,sha256=ywoEdqoHY9t5PYRstjitS_halEWO6veCL_06BekmMVo,9153
  workbench/utils/pandas_utils.py,sha256=LQTfZ3WJkg3rIahNJhsz1YV2y_0DBG94lO-KMmEY1g0,39325
  workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
@@ -256,7 +257,7 @@ workbench/web_interface/components/experiments/dashboard_metric_plots.py,sha256=
  workbench/web_interface/components/experiments/outlier_plot.py,sha256=5bWsmJEXyt50npeQxLHXCPtiq4WRVgg938Sl0DVjNWg,3647
  workbench/web_interface/components/plugins/ag_table.py,sha256=HrPOMotlOGigk0v8Cxx_doSHXdOKTT1-bzlsqDwwzng,3979
  workbench/web_interface/components/plugins/confusion_matrix.py,sha256=1K94JSlDwQwdf5uDYVydQzY-EQm89hYXchxbXoNvons,7176
- workbench/web_interface/components/plugins/dashboard_status.py,sha256=8Tu38lR5YgntxDjz_x2XfLiW7SOdreNLOFT5VkbYzKo,5748
+ workbench/web_interface/components/plugins/dashboard_status.py,sha256=4plmoiXj3dDjoQerUNpep_jfk50pI9rHvcoSP20UbE8,5832
  workbench/web_interface/components/plugins/data_details.py,sha256=pZm1AbM_0EXQwx77qUkfyrU9MedAs4Wlkp6iOtSrUtI,11104
  workbench/web_interface/components/plugins/endpoint_details.py,sha256=0A7g_Lx5-3XnDWOGT3YEDPNpmME_-WfYc65f-rRVjJE,3769
  workbench/web_interface/components/plugins/generated_compounds.py,sha256=hC0sh-1_rbN55Huno-E_2wF37kgIHi5Mtaer6Xk5fRM,8052
@@ -276,9 +277,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
  workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
  workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
  workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
- workbench-0.8.170.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
- workbench-0.8.170.dist-info/METADATA,sha256=GbS745jAMPDykgLqfLcwjb9nRnczT-uV9Q11GbPBAX8,9210
- workbench-0.8.170.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- workbench-0.8.170.dist-info/entry_points.txt,sha256=V_v6hQ4DYoCJnTnqbm036reCri_CXkA_ONcRSuF5OKg,305
- workbench-0.8.170.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
- workbench-0.8.170.dist-info/RECORD,,
+ workbench-0.8.171.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+ workbench-0.8.171.dist-info/METADATA,sha256=cLYIPKqidwQU6U3CIprMiMImJm8hwvKBAJBXGck_Aqo,9210
+ workbench-0.8.171.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ workbench-0.8.171.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+ workbench-0.8.171.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+ workbench-0.8.171.dist-info/RECORD,,
@@ -1,6 +1,7 @@
  [console_scripts]
  cloud_watch = workbench.scripts.monitor_cloud_watch:main
  glue_launcher = workbench.scripts.glue_launcher:main
- ml_pipeline_launcher = workbench.scripts.ml_pipeline_launcher:main
+ ml_pipeline_batch = workbench.scripts.ml_pipeline_batch:main
+ ml_pipeline_sqs = workbench.scripts.ml_pipeline_sqs:main
  workbench = workbench.repl.workbench_shell:launch_shell
  workbench_config = workbench.scripts.show_config:main
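Note on the hunk above: each console_scripts entry maps a command name to a module:callable pair, so installing 0.8.171 provides ml_pipeline_batch and ml_pipeline_sqs executables in place of ml_pipeline_launcher. A minimal sketch of the kind of module such an entry resolves to; the argument parsing below is illustrative only, not the packaged script.

# An entry like "ml_pipeline_batch = workbench.scripts.ml_pipeline_batch:main"
# means the installed console script imports that module and calls main().
import argparse

def main() -> None:
    # Hypothetical CLI; the real script's flags are not visible in this diff
    parser = argparse.ArgumentParser(description="Launch an ML pipeline (batch mode)")
    parser.add_argument("--pipeline", required=True, help="Name of the pipeline to run")
    args = parser.parse_args()
    print(f"Running pipeline: {args.pipeline}")

if __name__ == "__main__":
    main()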