workbench-0.8.168-py3-none-any.whl → workbench-0.8.193-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +71 -49
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +11 -6
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
  42. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  43. workbench/repl/workbench_shell.py +4 -4
  44. workbench/scripts/lambda_launcher.py +63 -0
  45. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  46. workbench/scripts/ml_pipeline_sqs.py +186 -0
  47. workbench/utils/chem_utils/__init__.py +0 -0
  48. workbench/utils/chem_utils/fingerprints.py +134 -0
  49. workbench/utils/chem_utils/misc.py +194 -0
  50. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  51. workbench/utils/chem_utils/mol_standardize.py +450 -0
  52. workbench/utils/chem_utils/mol_tagging.py +348 -0
  53. workbench/utils/chem_utils/projections.py +209 -0
  54. workbench/utils/chem_utils/salts.py +256 -0
  55. workbench/utils/chem_utils/sdf.py +292 -0
  56. workbench/utils/chem_utils/toxicity.py +250 -0
  57. workbench/utils/chem_utils/vis.py +253 -0
  58. workbench/utils/config_manager.py +2 -6
  59. workbench/utils/endpoint_utils.py +5 -7
  60. workbench/utils/license_manager.py +2 -6
  61. workbench/utils/model_utils.py +89 -31
  62. workbench/utils/monitor_utils.py +44 -62
  63. workbench/utils/pandas_utils.py +3 -3
  64. workbench/utils/shap_utils.py +10 -2
  65. workbench/utils/workbench_sqs.py +1 -1
  66. workbench/utils/xgboost_model_utils.py +300 -151
  67. workbench/web_interface/components/model_plot.py +7 -1
  68. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  69. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  70. workbench/web_interface/components/plugins/model_details.py +7 -2
  71. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  72. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
  73. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
  74. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
  75. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
  76. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  77. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  78. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  79. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  80. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  81. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  82. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  83. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  84. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  85. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  86. workbench/utils/chem_utils.py +0 -1556
  87. workbench/utils/fast_inference.py +0 -167
  88. workbench/utils/resource_utils.py +0 -39
  89. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
  90. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -28,14 +28,16 @@ from typing import List, Tuple
 
  # Template Parameters
  TEMPLATE_PARAMS = {
- "model_type": "classifier",
- "target_column": "class",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'pred_pka_reg'],
+ "model_type": "regressor",
+ "target": "solubility",
+ "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-with-pka-class-100-test/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-regression/training",
+ "train_all_data": False,
+ "hyperparameters": {},
  }
 
+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
  proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
  # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
  df = df.reset_index(drop=True)
 
  # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
  rename_dict = {}
  missing = []
-
  for feature in model_features:
  if feature in df.columns:
  continue # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")
-
+
+ # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)
 
 
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings
 
 
- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features
 
  Args:
  df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
  )
 
  # Decompress the specified compressed features
- decompressed_features = features
+ decompressed_features = features.copy()
  for feature in compressed_features:
  if (feature not in df.columns) or (feature not in features):
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
  """The main function is for training the XGBoost model"""
 
  # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2
 
  # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
  args = parser.parse_args()
 
  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")
 
  # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")
 
+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
  # Now spin up our XGB Model
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
  # Encode the target column
  label_encoder = LabelEncoder()
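Editor's note: the new hyperparameters dict is passed straight through to the XGBoost constructor via keyword expansion; a minimal sketch with illustrative values (the diff's default is an empty dict):

import xgboost as xgb

# Illustrative values only; TEMPLATE_PARAMS["hyperparameters"] defaults to {}
hyperparameters = {"n_estimators": 300, "max_depth": 6, "learning_rate": 0.05}
xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
print(xgb_model.get_params()["n_estimators"])  # 300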
@@ -271,12 +273,12 @@ if __name__ == "__main__":
  df_val[target] = label_encoder.transform(df_val[target])
 
  else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
  label_encoder = None # We don't need this for regression
 
  # Grab our Features, Target and Train the Model
  y_train = df_train[target]
- X_train= df_train[features]
+ X_train = df_train[features]
  xgb_model.fit(X_train, y_train)
 
  # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_
 
  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
  print(f"NumRows: {len(df_val)}")
 
  # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
  if label_encoder:
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
  def model_fn(model_dir):
  """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
-
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
  return model
 
 
@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
  """Parse input data and return a DataFrame."""
  if not input_data:
  raise ValueError("Empty input data is not supported!")
-
+
  # Decode bytes to string if necessary
  if isinstance(input_data, bytes):
  input_data = input_data.decode("utf-8")
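Editor's note: the serialization change above (save_model/load_model on xgb_model.json replaced by joblib on xgb_model.joblib) round-trips the full scikit-learn wrapper, so model_fn no longer has to parse the saved JSON to recover the estimator type. A minimal sketch, assuming xgboost, joblib, and scikit-learn are available:

import joblib
import xgboost as xgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=42)
model = xgb.XGBRegressor(enable_categorical=True, n_estimators=10)
model.fit(X, y)

# Training side (was: model.save_model("xgb_model.json"))
joblib.dump(model, "xgb_model.joblib")

# Inference side (model_fn): the class and its settings come back intact
restored = joblib.load("xgb_model.joblib")
print(type(restored).__name__, restored.predict(X[:3]))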
@@ -29,13 +29,15 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
  "model_type": "{{model_type}}",
- "target_column": "{{target_column}}",
+ "target": "{{target_column}}",
  "features": "{{feature_list}}",
  "compressed_features": "{{compressed_features}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
+ "hyperparameters": "{{hyperparameters}}",
  }
 
+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
  proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
  # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
  df = df.reset_index(drop=True)
 
  # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
  rename_dict = {}
  missing = []
-
  for feature in model_features:
  if feature in df.columns:
  continue # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")
-
+
+ # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)
 
 
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings
 
 
- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features
 
  Args:
  df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
  )
 
  # Decompress the specified compressed features
- decompressed_features = features
+ decompressed_features = features.copy()
  for feature in compressed_features:
  if (feature not in df.columns) or (feature not in features):
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
  """The main function is for training the XGBoost model"""
 
  # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2
 
  # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
  args = parser.parse_args()
 
  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")
 
  # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")
 
+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
  # Now spin up our XGB Model
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
  # Encode the target column
  label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
  df_val[target] = label_encoder.transform(df_val[target])
 
  else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
  label_encoder = None # We don't need this for regression
 
  # Grab our Features, Target and Train the Model
  y_train = df_train[target]
- X_train= df_train[features]
+ X_train = df_train[features]
  xgb_model.fit(X_train, y_train)
 
  # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_
 
  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
  print(f"NumRows: {len(df_val)}")
 
  # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
  if label_encoder:
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
  def model_fn(model_dir):
  """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
-
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
  return model
 
 
@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
  """Parse input data and return a DataFrame."""
  if not input_data:
  raise ValueError("Empty input data is not supported!")
-
+
  # Decode bytes to string if necessary
  if isinstance(input_data, bytes):
  input_data = input_data.decode("utf-8")
@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
  try:
  import rdkit # noqa
  import mordred # noqa
- from workbench.utils import chem_utils
+ from workbench.utils.chem_utils import vis
 
  HAVE_CHEM_UTILS = True
  except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:
 
  # Add cheminformatics utils if available
  if HAVE_CHEM_UTILS:
- self.commands["show"] = chem_utils.show
+ self.commands["show"] = vis.show
 
  def start(self):
  """Start the Workbench IPython shell"""
  cprint("magenta", "\nWelcome to Workbench!")
- if self.aws_status is False:
+ if not self.aws_status:
  cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
  cprint("red", f"Path: {self.cm.site_config_path}")
  self.show_config()
@@ -560,7 +560,7 @@ class WorkbenchShell:
  from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
 
  # Get kwargs
- theme = kwargs.get("theme", "dark")
+ theme = kwargs.get("theme", "midnight_blue")
 
  plugin_test = PluginUnitTest(plugin_class, theme=theme, input_data=data, **kwargs)
 
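Editor's note: the chem_utils changes above reflect the split of the monolithic workbench/utils/chem_utils.py into a package (files 47-57 in the list above); a hedged import sketch using the same guarded-import pattern as the shell:

try:
    # old (0.8.168): from workbench.utils import chem_utils; chem_utils.show(df)
    from workbench.utils.chem_utils import vis  # new (0.8.193): visualization helpers
    HAVE_CHEM_UTILS = True
except ImportError:
    HAVE_CHEM_UTILS = False  # workbench/rdkit/mordred not installed

if HAVE_CHEM_UTILS:
    print("show() available:", callable(vis.show))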
@@ -0,0 +1,63 @@
+ import sys
+ import os
+ import json
+ import importlib.util
+
+
+ def main():
+ if len(sys.argv) != 2:
+ print("Usage: lambda_launcher <handler_module_name>")
+ print("\nOptional: testing/event.json with test event")
+ print("Optional: testing/env.json with environment variables")
+ sys.exit(1)
+
+ handler_file = sys.argv[1]
+
+ # Add .py if not present
+ if not handler_file.endswith(".py"):
+ handler_file += ".py"
+
+ # Check if file exists
+ if not os.path.exists(handler_file):
+ print(f"Error: File '{handler_file}' not found")
+ sys.exit(1)
+
+ # Load environment variables from env.json if it exists
+ if os.path.exists("testing/env.json"):
+ print("Loading environment variables from testing/env.json")
+ with open("testing/env.json") as f:
+ env_vars = json.load(f)
+ for key, value in env_vars.items():
+ os.environ[key] = value
+ print(f" Set {key} = {value}")
+ print()
+
+ # Load event configuration
+ if os.path.exists("testing/event.json"):
+ print("Loading event from testing/event.json")
+ with open("testing/event.json") as f:
+ event = json.load(f)
+ else:
+ print("No testing/event.json found, using empty event")
+ event = {}
+
+ # Load the module dynamically
+ spec = importlib.util.spec_from_file_location("lambda_module", handler_file)
+ lambda_module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(lambda_module)
+
+ # Call the lambda_handler
+ print(f"Invoking lambda_handler from {handler_file}...")
+ print("-" * 50)
+ print(f"Event: {json.dumps(event, indent=2)}")
+ print("-" * 50)
+
+ result = lambda_module.lambda_handler(event, {})
+
+ print("-" * 50)
+ print("Result:")
+ print(json.dumps(result, indent=2))
+
+
+ if __name__ == "__main__":
+ main()
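Editor's note: a hedged usage sketch for the new lambda_launcher entry point; the handler module below is hypothetical, while the testing/event.json and testing/env.json fixture names come straight from the script:

# my_handler.py -- hypothetical handler exercised locally with:
#   lambda_launcher my_handler
# Optional fixtures picked up automatically if present:
#   testing/env.json   (environment variables)
#   testing/event.json (test event)
def lambda_handler(event, context):
    return {"statusCode": 200, "body": f"echo: {event}"}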
@@ -27,60 +27,56 @@ def get_batch_role_arn() -> str:
  return f"arn:aws:iam::{account_id}:role/Workbench-BatchRole"
 
 
- def ensure_job_definition():
- """Register or update the Batch job definition for ML pipeline runner."""
- batch = AWSAccountClamp().boto3_session.client("batch")
- name = "workbench-ml-pipeline-runner"
- response = batch.register_job_definition(
- jobDefinitionName=name,
- type="container",
- platformCapabilities=["FARGATE"],
- containerProperties={
- "image": get_ecr_image_uri(),
- "resourceRequirements": [{"type": "VCPU", "value": "2"}, {"type": "MEMORY", "value": "4096"}],
- "jobRoleArn": get_batch_role_arn(),
- "executionRoleArn": get_batch_role_arn(),
- "environment": [
- {"name": "WORKBENCH_BUCKET", "value": workbench_bucket},
- {"name": "PYTHONUNBUFFERED", "value": "1"},
- ],
- # "networkConfiguration": {"assignPublicIp": "ENABLED"}, # Required for ECR Image Pull (when not in VPC)
- },
- timeout={"attemptDurationSeconds": 10800}, # 3 hours
- )
- log.info(f"Job definition ready: {name} (revision {response['revision']})")
- return name
+ def _log_cloudwatch_link(job: dict, message_prefix: str = "View logs") -> None:
+ """
+ Helper method to log CloudWatch logs link with clickable URL and full URL display.
+
+ Args:
+ job: Batch job description dictionary
+ message_prefix: Prefix for the log message (default: "View logs")
+ """
+ log_stream = job.get("container", {}).get("logStreamName")
+ logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream)
+ if logs_url:
+ clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
+ log.info(f"{message_prefix}: {clickable_url}")
+ else:
+ log.info("Check AWS Batch console for logs")
 
 
- def run_batch_job(script_path: str) -> int:
+ def run_batch_job(script_path: str, size: str = "small") -> int:
  """
  Submit and monitor an AWS Batch job for ML pipeline execution.
- This function:
- 1. Uploads the ML pipeline script to S3
- 2. Submits a Batch job to run the script in a container
- 3. Monitors job status until completion
- 4. Returns the job's exit code
+
+ Uploads script to S3, submits Batch job, monitors until completion or 2 minutes of RUNNING.
 
  Args:
  script_path: Local path to the ML pipeline script
+ size: Job size tier - "small" (default), "medium", or "large"
+ - small: 2 vCPU, 4GB RAM for lightweight processing
+ - medium: 4 vCPU, 8GB RAM for standard ML workloads
+ - large: 8 vCPU, 16GB RAM for heavy training/inference
 
  Returns:
- Exit code from the batch job (0 for success, non-zero for failure)
+ Exit code (0 for success/disconnected, non-zero for failure)
  """
+ if size not in ["small", "medium", "large"]:
+ raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
  batch = AWSAccountClamp().boto3_session.client("batch")
  script_name = Path(script_path).stem
 
- # Upload script to S3 for the container to download
+ # Upload script to S3
  s3_path = f"s3://{workbench_bucket}/batch-jobs/{Path(script_path).name}"
  log.info(f"Uploading script to {s3_path}")
  upload_content_to_s3(Path(script_path).read_text(), s3_path)
 
- # Submit the Batch job
+ # Submit job
  job_name = f"workbench_{script_name}_{datetime.now():%Y%m%d_%H%M%S}"
  response = batch.submit_job(
  jobName=job_name,
  jobQueue="workbench-job-queue",
- jobDefinition=ensure_job_definition(),
+ jobDefinition=f"workbench-batch-{size}",
  containerOverrides={
  "environment": [
  {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -89,36 +85,38 @@ def run_batch_job(script_path: str) -> int:
  },
  )
  job_id = response["jobId"]
- log.info(f"Submitted job: {job_name} ({job_id})")
+ log.info(f"Submitted job: {job_name} ({job_id}) using {size} tier")
 
- # Monitor job execution
- last_status = None
+ # Monitor job
+ last_status, running_start = None, None
  while True:
- # Check job status
  job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
  status = job["status"]
+
  if status != last_status:
  log.info(f"Job status: {status}")
  last_status = status
+ if status == "RUNNING":
+ running_start = time.time()
+
+ # Disconnect after 2 minutes of running
+ if status == "RUNNING" and running_start and (time.time() - running_start >= 120):
+ log.info("✅ ML Pipeline is running successfully!")
+ _log_cloudwatch_link(job, "📊 Monitor logs")
+ return 0
 
- # Check if job completed
+ # Handle completion
  if status in ["SUCCEEDED", "FAILED"]:
  exit_code = job.get("attempts", [{}])[-1].get("exitCode", 1)
- if status == "FAILED":
- log.error(f"Job failed: {job.get('statusReason', 'Unknown reason')}")
- else:
- log.info("Job completed successfully")
-
- # Get CloudWatch logs URL
- log_stream_name = job.get("container", {}).get("logStreamName")
- logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream_name)
- if logs_url:
- # OSC 8 hyperlink format for modern terminals
- clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
- log.info(f"View logs: {clickable_url}")
+ msg = (
+ "Job completed successfully"
+ if status == "SUCCEEDED"
+ else f"Job failed: {job.get('statusReason', 'Unknown')}"
+ )
+ log.info(msg) if status == "SUCCEEDED" else log.error(msg)
+ _log_cloudwatch_link(job)
  return exit_code
 
- # Sleep a bit before next status check
  time.sleep(10)
 
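Editor's note: a hedged sketch of calling the reworked run_batch_job with a size tier (assumes AWS credentials, the workbench-job-queue, and pre-registered workbench-batch-<size> job definitions, which replace the old inline ensure_job_definition(); the import path mirrors the renamed ml_pipeline_batch.py):

from workbench.scripts.ml_pipeline_batch import run_batch_job

# "medium" selects the 4 vCPU / 8GB job definition; the call returns 0 once the
# job has been RUNNING for ~2 minutes (or the job's exit code on completion).
exit_code = run_batch_job("my_pipeline.py", size="medium")
raise SystemExit(exit_code)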