workbench 0.8.168__py3-none-any.whl → 0.8.193__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +143 -102
- workbench/algorithms/graph/light/proximity_graph.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +3 -2
- workbench/api/feature_set.py +4 -4
- workbench/api/model.py +16 -12
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +113 -27
- workbench/core/artifacts/feature_set_core.py +72 -13
- workbench/core/artifacts/model_core.py +71 -49
- workbench/core/artifacts/monitor_core.py +33 -249
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +11 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +11 -6
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +49 -53
- workbench/core/views/view.py +51 -1
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
- workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
- workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +7 -2
- workbench/model_scripts/uq_models/mapie.template +492 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
- workbench/model_scripts/xgb_model/xgb_model.template +31 -40
- workbench/repl/workbench_shell.py +4 -4
- workbench/scripts/lambda_launcher.py +63 -0
- workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +89 -31
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +300 -151
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +7 -2
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
workbench/model_scripts/xgb_model/generated_model_script.py (regenerated with the AQSol regression example values; several removed lines are truncated in the source):

@@ -28,14 +28,16 @@ from typing import List, Tuple
 
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "
-    "features": ['
+    "model_type": "regressor",
+    "target": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://
-    "train_all_data":
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-regression/training",
+    "train_all_data": False,
+    "hyperparameters": {},
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)
 
     # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
-
+
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings
 
 
-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features
 
     Args:
         df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
     )
 
     # Decompress the specified compressed features
-    decompressed_features = features
+    decompressed_features = features.copy()
     for feature in compressed_features:
         if (feature not in df.columns) or (feature not in features):
             print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
     """The main function is for training the XGBoost model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])
 
     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
        label_encoder = None  # We don't need this for regression
 
     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)
 
     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
 
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")
 
     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
     return model
 
 
@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
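The main functional change above is the new "hyperparameters" template parameter, which is unpacked directly into the XGBoost constructor. A minimal sketch of that pass-through (the parameter names and values below are illustrative, not taken from the diff):

    import xgboost as xgb

    # Hypothetical hyperparameters; an empty dict (the template default) is a no-op
    hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1}

    # Mirrors the new constructor call: any supplied keys flow straight into the estimator
    xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
    print(xgb_model.get_params()["n_estimators"])  # 200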
workbench/model_scripts/xgb_model/xgb_model.template:

@@ -29,13 +29,15 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """

(The remaining hunks in this file, from @@ -75,7 +77,7 @@ through @@ -390,7 +381,7 @@, are identical to the corresponding hunks shown above for the generated script.)
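Both the generated script and the template now persist the estimator with joblib instead of reading XGBoost's native JSON back in, so model_fn becomes a plain joblib load with no classifier/regressor sniffing. A small sketch of that round-trip outside SageMaker (the local path is illustrative; SageMaker passes its own model_dir):

    import os
    import joblib
    import xgboost as xgb

    model_dir = "/tmp/model"  # illustrative stand-in for the SageMaker model directory
    os.makedirs(model_dir, exist_ok=True)

    # Training side: save under the standard name used by the new script
    xgb_model = xgb.XGBRegressor(enable_categorical=True)
    joblib.dump(xgb_model, os.path.join(model_dir, "xgb_model.joblib"))

    # Serving side: model_fn now just loads the same file back
    model = joblib.load(os.path.join(model_dir, "xgb_model.joblib"))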
workbench/repl/workbench_shell.py:

@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
 try:
     import rdkit  # noqa
     import mordred  # noqa
-    from workbench.utils import
+    from workbench.utils.chem_utils import vis
 
     HAVE_CHEM_UTILS = True
 except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:
 
         # Add cheminformatics utils if available
         if HAVE_CHEM_UTILS:
-            self.commands["show"] =
+            self.commands["show"] = vis.show
 
     def start(self):
         """Start the Workbench IPython shell"""
         cprint("magenta", "\nWelcome to Workbench!")
-        if self.aws_status
+        if not self.aws_status:
             cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
             cprint("red", f"Path: {self.cm.site_config_path}")
             self.show_config()
@@ -560,7 +560,7 @@ class WorkbenchShell:
         from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
 
         # Get kwargs
-        theme = kwargs.get("theme", "
+        theme = kwargs.get("theme", "midnight_blue")
 
         plugin_test = PluginUnitTest(plugin_class, theme=theme, input_data=data, **kwargs)
 
workbench/scripts/lambda_launcher.py (new file):

@@ -0,0 +1,63 @@
+import sys
+import os
+import json
+import importlib.util
+
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: lambda_launcher <handler_module_name>")
+        print("\nOptional: testing/event.json with test event")
+        print("Optional: testing/env.json with environment variables")
+        sys.exit(1)
+
+    handler_file = sys.argv[1]
+
+    # Add .py if not present
+    if not handler_file.endswith(".py"):
+        handler_file += ".py"
+
+    # Check if file exists
+    if not os.path.exists(handler_file):
+        print(f"Error: File '{handler_file}' not found")
+        sys.exit(1)
+
+    # Load environment variables from env.json if it exists
+    if os.path.exists("testing/env.json"):
+        print("Loading environment variables from testing/env.json")
+        with open("testing/env.json") as f:
+            env_vars = json.load(f)
+        for key, value in env_vars.items():
+            os.environ[key] = value
+            print(f" Set {key} = {value}")
+        print()
+
+    # Load event configuration
+    if os.path.exists("testing/event.json"):
+        print("Loading event from testing/event.json")
+        with open("testing/event.json") as f:
+            event = json.load(f)
+    else:
+        print("No testing/event.json found, using empty event")
+        event = {}
+
+    # Load the module dynamically
+    spec = importlib.util.spec_from_file_location("lambda_module", handler_file)
+    lambda_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(lambda_module)
+
+    # Call the lambda_handler
+    print(f"Invoking lambda_handler from {handler_file}...")
+    print("-" * 50)
+    print(f"Event: {json.dumps(event, indent=2)}")
+    print("-" * 50)
+
+    result = lambda_module.lambda_handler(event, {})
+
+    print("-" * 50)
+    print("Result:")
+    print(json.dumps(result, indent=2))
+
+
+if __name__ == "__main__":
+    main()
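The new lambda_launcher.py loads a handler module by file path and calls its lambda_handler(event, context) with an event read from testing/event.json (or an empty dict). A hypothetical handler it could exercise locally (the module name and fields below are illustrative, not part of the package):

    # my_handler.py -- hypothetical handler for local testing with the launcher above
    import json
    import os


    def lambda_handler(event, context):
        # Environment variables can be injected via testing/env.json
        stage = os.environ.get("STAGE", "dev")
        return {"statusCode": 200, "body": json.dumps({"stage": stage, "echo": event})}

Running "lambda_launcher my_handler" (per the usage string) from a directory containing testing/event.json prints the event, invokes the handler with an empty context dict, and pretty-prints the returned result.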
workbench/scripts/ml_pipeline_batch.py (renamed from ml_pipeline_launcher.py; a number of removed lines were lost in the source and appear as bare "-" markers):

@@ -27,60 +27,56 @@ def get_batch_role_arn() -> str:
     return f"arn:aws:iam::{account_id}:role/Workbench-BatchRole"
 
 
-def
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-                {"name": "PYTHONUNBUFFERED", "value": "1"},
-            ],
-            # "networkConfiguration": {"assignPublicIp": "ENABLED"},  # Required for ECR Image Pull (when not in VPC)
-        },
-        timeout={"attemptDurationSeconds": 10800},  # 3 hours
-    )
-    log.info(f"Job definition ready: {name} (revision {response['revision']})")
-    return name
+def _log_cloudwatch_link(job: dict, message_prefix: str = "View logs") -> None:
+    """
+    Helper method to log CloudWatch logs link with clickable URL and full URL display.
+
+    Args:
+        job: Batch job description dictionary
+        message_prefix: Prefix for the log message (default: "View logs")
+    """
+    log_stream = job.get("container", {}).get("logStreamName")
+    logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream)
+    if logs_url:
+        clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
+        log.info(f"{message_prefix}: {clickable_url}")
+    else:
+        log.info("Check AWS Batch console for logs")
 
 
-def run_batch_job(script_path: str) -> int:
+def run_batch_job(script_path: str, size: str = "small") -> int:
     """
     Submit and monitor an AWS Batch job for ML pipeline execution.
-
-
-    2. Submits a Batch job to run the script in a container
-    3. Monitors job status until completion
-    4. Returns the job's exit code
+
+    Uploads script to S3, submits Batch job, monitors until completion or 2 minutes of RUNNING.
 
     Args:
         script_path: Local path to the ML pipeline script
+        size: Job size tier - "small" (default), "medium", or "large"
+            - small: 2 vCPU, 4GB RAM for lightweight processing
+            - medium: 4 vCPU, 8GB RAM for standard ML workloads
+            - large: 8 vCPU, 16GB RAM for heavy training/inference
 
     Returns:
-        Exit code
+        Exit code (0 for success/disconnected, non-zero for failure)
     """
+    if size not in ["small", "medium", "large"]:
+        raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
     batch = AWSAccountClamp().boto3_session.client("batch")
     script_name = Path(script_path).stem
 
-    # Upload script to S3
+    # Upload script to S3
     s3_path = f"s3://{workbench_bucket}/batch-jobs/{Path(script_path).name}"
     log.info(f"Uploading script to {s3_path}")
     upload_content_to_s3(Path(script_path).read_text(), s3_path)
 
-    # Submit
+    # Submit job
     job_name = f"workbench_{script_name}_{datetime.now():%Y%m%d_%H%M%S}"
     response = batch.submit_job(
         jobName=job_name,
         jobQueue="workbench-job-queue",
-        jobDefinition=
+        jobDefinition=f"workbench-batch-{size}",
         containerOverrides={
             "environment": [
                 {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -89,36 +85,38 @@ def run_batch_job(script_path: str) -> int:
         },
     )
     job_id = response["jobId"]
-    log.info(f"Submitted job: {job_name} ({job_id})")
+    log.info(f"Submitted job: {job_name} ({job_id}) using {size} tier")
 
-    # Monitor job
-    last_status = None
+    # Monitor job
+    last_status, running_start = None, None
     while True:
-        # Check job status
         job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
         status = job["status"]
+
         if status != last_status:
            log.info(f"Job status: {status}")
            last_status = status
+            if status == "RUNNING":
+                running_start = time.time()
+
+        # Disconnect after 2 minutes of running
+        if status == "RUNNING" and running_start and (time.time() - running_start >= 120):
+            log.info("✅ ML Pipeline is running successfully!")
+            _log_cloudwatch_link(job, "📊 Monitor logs")
+            return 0
 
-        #
+        # Handle completion
         if status in ["SUCCEEDED", "FAILED"]:
             exit_code = job.get("attempts", [{}])[-1].get("exitCode", 1)
-
-
-
-
-
-
-
-            logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream_name)
-            if logs_url:
-                # OSC 8 hyperlink format for modern terminals
-                clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
-                log.info(f"View logs: {clickable_url}")
+            msg = (
+                "Job completed successfully"
+                if status == "SUCCEEDED"
+                else f"Job failed: {job.get('statusReason', 'Unknown')}"
+            )
+            log.info(msg) if status == "SUCCEEDED" else log.error(msg)
+            _log_cloudwatch_link(job)
             return exit_code
 
-        # Sleep a bit before next status check
         time.sleep(10)
 
 
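run_batch_job now takes a size tier that selects a pre-registered job definition (workbench-batch-small, -medium, or -large) instead of registering one on the fly, and it disconnects with exit code 0 once the job has been RUNNING for two minutes. A minimal caller sketch, assuming the function is importable from the renamed module and using an illustrative script path:

    # Hedged sketch: import path inferred from the renamed file workbench/scripts/ml_pipeline_batch.py
    import sys
    from workbench.scripts.ml_pipeline_batch import run_batch_job

    # "medium" maps to the pre-registered workbench-batch-medium job definition (4 vCPU / 8GB)
    exit_code = run_batch_job("my_pipeline.py", size="medium")  # script path is illustrative
    sys.exit(exit_code)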