workbench-0.8.168-py3-none-any.whl → workbench-0.8.193-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +71 -49
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +11 -6
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
  42. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  43. workbench/repl/workbench_shell.py +4 -4
  44. workbench/scripts/lambda_launcher.py +63 -0
  45. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  46. workbench/scripts/ml_pipeline_sqs.py +186 -0
  47. workbench/utils/chem_utils/__init__.py +0 -0
  48. workbench/utils/chem_utils/fingerprints.py +134 -0
  49. workbench/utils/chem_utils/misc.py +194 -0
  50. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  51. workbench/utils/chem_utils/mol_standardize.py +450 -0
  52. workbench/utils/chem_utils/mol_tagging.py +348 -0
  53. workbench/utils/chem_utils/projections.py +209 -0
  54. workbench/utils/chem_utils/salts.py +256 -0
  55. workbench/utils/chem_utils/sdf.py +292 -0
  56. workbench/utils/chem_utils/toxicity.py +250 -0
  57. workbench/utils/chem_utils/vis.py +253 -0
  58. workbench/utils/config_manager.py +2 -6
  59. workbench/utils/endpoint_utils.py +5 -7
  60. workbench/utils/license_manager.py +2 -6
  61. workbench/utils/model_utils.py +89 -31
  62. workbench/utils/monitor_utils.py +44 -62
  63. workbench/utils/pandas_utils.py +3 -3
  64. workbench/utils/shap_utils.py +10 -2
  65. workbench/utils/workbench_sqs.py +1 -1
  66. workbench/utils/xgboost_model_utils.py +300 -151
  67. workbench/web_interface/components/model_plot.py +7 -1
  68. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  69. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  70. workbench/web_interface/components/plugins/model_details.py +7 -2
  71. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  72. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
  73. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
  74. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
  75. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
  76. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  77. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  78. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  79. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  80. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  81. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  82. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  83. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  84. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  85. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  86. workbench/utils/chem_utils.py +0 -1556
  87. workbench/utils/fast_inference.py +0 -167
  88. workbench/utils/resource_utils.py +0 -39
  89. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
  90. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -28,14 +28,16 @@ from typing import List, Tuple
 
  # Template Parameters
  TEMPLATE_PARAMS = {
- "model_type": "classifier",
- "target_column": "class",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'pred_pka_reg'],
+ "model_type": "regressor",
+ "target": "solubility",
+ "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-with-pka-class-100-test/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-regression/training",
+ "train_all_data": False,
+ "hyperparameters": {},
  }
 
+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
  proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
  # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
  df = df.reset_index(drop=True)
 
  # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
  rename_dict = {}
  missing = []
-
  for feature in model_features:
  if feature in df.columns:
  continue # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")
-
+
+ # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)
 
 
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings
 
 
- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features
 
  Args:
  df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
  )
 
  # Decompress the specified compressed features
- decompressed_features = features
+ decompressed_features = features.copy()
  for feature in compressed_features:
  if (feature not in df.columns) or (feature not in features):
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
  """The main function is for training the XGBoost model"""
 
  # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2
 
  # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
  args = parser.parse_args()
 
  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")
 
  # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")
 
+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
  # Now spin up our XGB Model
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
  # Encode the target column
  label_encoder = LabelEncoder()
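Editor's note: the new hyperparameters dict is passed straight through to the XGBoost constructor via keyword expansion; a minimal sketch with illustrative values (the diff's default is an empty dict):

import xgboost as xgb

# Illustrative values only; TEMPLATE_PARAMS["hyperparameters"] defaults to {}
hyperparameters = {"n_estimators": 300, "max_depth": 6, "learning_rate": 0.05}
xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
print(xgb_model.get_params()["n_estimators"])  # 300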
@@ -271,12 +273,12 @@ if __name__ == "__main__":
  df_val[target] = label_encoder.transform(df_val[target])
 
  else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
  label_encoder = None # We don't need this for regression
 
  # Grab our Features, Target and Train the Model
  y_train = df_train[target]
- X_train= df_train[features]
+ X_train = df_train[features]
  xgb_model.fit(X_train, y_train)
 
  # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_
 
  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
  print(f"NumRows: {len(df_val)}")
 
  # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
  if label_encoder:
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
  def model_fn(model_dir):
  """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
-
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
  return model
 
 
@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
  """Parse input data and return a DataFrame."""
  if not input_data:
  raise ValueError("Empty input data is not supported!")
-
+
  # Decode bytes to string if necessary
  if isinstance(input_data, bytes):
  input_data = input_data.decode("utf-8")
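Editor's note: the serialization change above (save_model/load_model on xgb_model.json replaced by joblib on xgb_model.joblib) round-trips the full scikit-learn wrapper, so model_fn no longer has to parse the saved JSON to recover the estimator type. A minimal sketch, assuming xgboost, joblib, and scikit-learn are available:

import joblib
import xgboost as xgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=42)
model = xgb.XGBRegressor(enable_categorical=True, n_estimators=10)
model.fit(X, y)

# Training side (was: model.save_model("xgb_model.json"))
joblib.dump(model, "xgb_model.joblib")

# Inference side (model_fn): the class and its settings come back intact
restored = joblib.load("xgb_model.joblib")
print(type(restored).__name__, restored.predict(X[:3]))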
@@ -29,13 +29,15 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
  "model_type": "{{model_type}}",
- "target_column": "{{target_column}}",
+ "target": "{{target_column}}",
  "features": "{{feature_list}}",
  "compressed_features": "{{compressed_features}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
+ "hyperparameters": "{{hyperparameters}}",
  }
 
+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
  proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
  # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
  df = df.reset_index(drop=True)
 
  # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
  rename_dict = {}
  missing = []
-
  for feature in model_features:
  if feature in df.columns:
  continue # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")
-
+
+ # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)
 
 
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings
 
 
- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features
 
  Args:
  df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
  )
 
  # Decompress the specified compressed features
- decompressed_features = features
+ decompressed_features = features.copy()
  for feature in compressed_features:
  if (feature not in df.columns) or (feature not in features):
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
  """The main function is for training the XGBoost model"""
 
  # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2
 
  # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
  args = parser.parse_args()
 
  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")
 
  # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")
 
+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
  # Now spin up our XGB Model
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
  # Encode the target column
  label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
  df_val[target] = label_encoder.transform(df_val[target])
 
  else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
  label_encoder = None # We don't need this for regression
 
  # Grab our Features, Target and Train the Model
  y_train = df_train[target]
- X_train= df_train[features]
+ X_train = df_train[features]
  xgb_model.fit(X_train, y_train)
 
  # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_
 
  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
  print(f"NumRows: {len(df_val)}")
 
  # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
  if label_encoder:
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
  def model_fn(model_dir):
  """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
-
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
  return model
 
 
@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
  """Parse input data and return a DataFrame."""
  if not input_data:
  raise ValueError("Empty input data is not supported!")
-
+
  # Decode bytes to string if necessary
  if isinstance(input_data, bytes):
  input_data = input_data.decode("utf-8")
@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
  try:
  import rdkit # noqa
  import mordred # noqa
- from workbench.utils import chem_utils
+ from workbench.utils.chem_utils import vis
 
  HAVE_CHEM_UTILS = True
  except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:
 
  # Add cheminformatics utils if available
  if HAVE_CHEM_UTILS:
- self.commands["show"] = chem_utils.show
+ self.commands["show"] = vis.show
 
  def start(self):
  """Start the Workbench IPython shell"""
  cprint("magenta", "\nWelcome to Workbench!")
- if self.aws_status is False:
+ if not self.aws_status:
  cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
  cprint("red", f"Path: {self.cm.site_config_path}")
  self.show_config()
@@ -560,7 +560,7 @@ class WorkbenchShell:
  from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
 
  # Get kwargs
- theme = kwargs.get("theme", "dark")
+ theme = kwargs.get("theme", "midnight_blue")
 
  plugin_test = PluginUnitTest(plugin_class, theme=theme, input_data=data, **kwargs)
 
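Editor's note: the chem_utils changes above reflect the split of the monolithic workbench/utils/chem_utils.py into a package (files 47-57 in the list above); a hedged import sketch using the same guarded-import pattern as the shell:

try:
    # old (0.8.168): from workbench.utils import chem_utils; chem_utils.show(df)
    from workbench.utils.chem_utils import vis  # new (0.8.193): visualization helpers
    HAVE_CHEM_UTILS = True
except ImportError:
    HAVE_CHEM_UTILS = False  # workbench/rdkit/mordred not installed

if HAVE_CHEM_UTILS:
    print("show() available:", callable(vis.show))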
@@ -0,0 +1,63 @@
+ import sys
+ import os
+ import json
+ import importlib.util
+
+
+ def main():
+ if len(sys.argv) != 2:
+ print("Usage: lambda_launcher <handler_module_name>")
+ print("\nOptional: testing/event.json with test event")
+ print("Optional: testing/env.json with environment variables")
+ sys.exit(1)
+
+ handler_file = sys.argv[1]
+
+ # Add .py if not present
+ if not handler_file.endswith(".py"):
+ handler_file += ".py"
+
+ # Check if file exists
+ if not os.path.exists(handler_file):
+ print(f"Error: File '{handler_file}' not found")
+ sys.exit(1)
+
+ # Load environment variables from env.json if it exists
+ if os.path.exists("testing/env.json"):
+ print("Loading environment variables from testing/env.json")
+ with open("testing/env.json") as f:
+ env_vars = json.load(f)
+ for key, value in env_vars.items():
+ os.environ[key] = value
+ print(f" Set {key} = {value}")
+ print()
+
+ # Load event configuration
+ if os.path.exists("testing/event.json"):
+ print("Loading event from testing/event.json")
+ with open("testing/event.json") as f:
+ event = json.load(f)
+ else:
+ print("No testing/event.json found, using empty event")
+ event = {}
+
+ # Load the module dynamically
+ spec = importlib.util.spec_from_file_location("lambda_module", handler_file)
+ lambda_module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(lambda_module)
+
+ # Call the lambda_handler
+ print(f"Invoking lambda_handler from {handler_file}...")
+ print("-" * 50)
+ print(f"Event: {json.dumps(event, indent=2)}")
+ print("-" * 50)
+
+ result = lambda_module.lambda_handler(event, {})
+
+ print("-" * 50)
+ print("Result:")
+ print(json.dumps(result, indent=2))
+
+
+ if __name__ == "__main__":
+ main()
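Editor's note: a hedged usage sketch for the new lambda_launcher entry point; the handler module below is hypothetical, while the testing/event.json and testing/env.json fixture names come straight from the script:

# my_handler.py -- hypothetical handler exercised locally with:
#   lambda_launcher my_handler
# Optional fixtures picked up automatically if present:
#   testing/env.json   (environment variables)
#   testing/event.json (test event)
def lambda_handler(event, context):
    return {"statusCode": 200, "body": f"echo: {event}"}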
@@ -27,60 +27,56 @@ def get_batch_role_arn() -> str:
  return f"arn:aws:iam::{account_id}:role/Workbench-BatchRole"
 
 
- def ensure_job_definition():
- """Register or update the Batch job definition for ML pipeline runner."""
- batch = AWSAccountClamp().boto3_session.client("batch")
- name = "workbench-ml-pipeline-runner"
- response = batch.register_job_definition(
- jobDefinitionName=name,
- type="container",
- platformCapabilities=["FARGATE"],
- containerProperties={
- "image": get_ecr_image_uri(),
- "resourceRequirements": [{"type": "VCPU", "value": "2"}, {"type": "MEMORY", "value": "4096"}],
- "jobRoleArn": get_batch_role_arn(),
- "executionRoleArn": get_batch_role_arn(),
- "environment": [
- {"name": "WORKBENCH_BUCKET", "value": workbench_bucket},
- {"name": "PYTHONUNBUFFERED", "value": "1"},
- ],
- # "networkConfiguration": {"assignPublicIp": "ENABLED"}, # Required for ECR Image Pull (when not in VPC)
- },
- timeout={"attemptDurationSeconds": 10800}, # 3 hours
- )
- log.info(f"Job definition ready: {name} (revision {response['revision']})")
- return name
+ def _log_cloudwatch_link(job: dict, message_prefix: str = "View logs") -> None:
+ """
+ Helper method to log CloudWatch logs link with clickable URL and full URL display.
+
+ Args:
+ job: Batch job description dictionary
+ message_prefix: Prefix for the log message (default: "View logs")
+ """
+ log_stream = job.get("container", {}).get("logStreamName")
+ logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream)
+ if logs_url:
+ clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
+ log.info(f"{message_prefix}: {clickable_url}")
+ else:
+ log.info("Check AWS Batch console for logs")
 
 
- def run_batch_job(script_path: str) -> int:
+ def run_batch_job(script_path: str, size: str = "small") -> int:
  """
  Submit and monitor an AWS Batch job for ML pipeline execution.
- This function:
- 1. Uploads the ML pipeline script to S3
- 2. Submits a Batch job to run the script in a container
- 3. Monitors job status until completion
- 4. Returns the job's exit code
+
+ Uploads script to S3, submits Batch job, monitors until completion or 2 minutes of RUNNING.
 
  Args:
  script_path: Local path to the ML pipeline script
+ size: Job size tier - "small" (default), "medium", or "large"
+ - small: 2 vCPU, 4GB RAM for lightweight processing
+ - medium: 4 vCPU, 8GB RAM for standard ML workloads
+ - large: 8 vCPU, 16GB RAM for heavy training/inference
 
  Returns:
- Exit code from the batch job (0 for success, non-zero for failure)
+ Exit code (0 for success/disconnected, non-zero for failure)
  """
+ if size not in ["small", "medium", "large"]:
+ raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+
  batch = AWSAccountClamp().boto3_session.client("batch")
  script_name = Path(script_path).stem
 
- # Upload script to S3 for the container to download
+ # Upload script to S3
  s3_path = f"s3://{workbench_bucket}/batch-jobs/{Path(script_path).name}"
  log.info(f"Uploading script to {s3_path}")
  upload_content_to_s3(Path(script_path).read_text(), s3_path)
 
- # Submit the Batch job
+ # Submit job
  job_name = f"workbench_{script_name}_{datetime.now():%Y%m%d_%H%M%S}"
  response = batch.submit_job(
  jobName=job_name,
  jobQueue="workbench-job-queue",
- jobDefinition=ensure_job_definition(),
+ jobDefinition=f"workbench-batch-{size}",
  containerOverrides={
  "environment": [
  {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -89,36 +85,38 @@ def run_batch_job(script_path: str) -> int:
  },
  )
  job_id = response["jobId"]
- log.info(f"Submitted job: {job_name} ({job_id})")
+ log.info(f"Submitted job: {job_name} ({job_id}) using {size} tier")
 
- # Monitor job execution
- last_status = None
+ # Monitor job
+ last_status, running_start = None, None
  while True:
- # Check job status
  job = batch.describe_jobs(jobs=[job_id])["jobs"][0]
  status = job["status"]
+
  if status != last_status:
  log.info(f"Job status: {status}")
  last_status = status
+ if status == "RUNNING":
+ running_start = time.time()
+
+ # Disconnect after 2 minutes of running
+ if status == "RUNNING" and running_start and (time.time() - running_start >= 120):
+ log.info("✅ ML Pipeline is running successfully!")
+ _log_cloudwatch_link(job, "📊 Monitor logs")
+ return 0
 
- # Check if job completed
+ # Handle completion
  if status in ["SUCCEEDED", "FAILED"]:
  exit_code = job.get("attempts", [{}])[-1].get("exitCode", 1)
- if status == "FAILED":
- log.error(f"Job failed: {job.get('statusReason', 'Unknown reason')}")
- else:
- log.info("Job completed successfully")
-
- # Get CloudWatch logs URL
- log_stream_name = job.get("container", {}).get("logStreamName")
- logs_url = get_cloudwatch_logs_url(log_group="/aws/batch/job", log_stream=log_stream_name)
- if logs_url:
- # OSC 8 hyperlink format for modern terminals
- clickable_url = f"\033]8;;{logs_url}\033\\{logs_url}\033]8;;\033\\"
- log.info(f"View logs: {clickable_url}")
+ msg = (
+ "Job completed successfully"
+ if status == "SUCCEEDED"
+ else f"Job failed: {job.get('statusReason', 'Unknown')}"
+ )
+ log.info(msg) if status == "SUCCEEDED" else log.error(msg)
+ _log_cloudwatch_link(job)
  return exit_code
 
- # Sleep a bit before next status check
  time.sleep(10)
 
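Editor's note: a hedged sketch of calling the reworked run_batch_job with a size tier (assumes AWS credentials, the workbench-job-queue, and pre-registered workbench-batch-<size> job definitions, which replace the old inline ensure_job_definition(); the import path mirrors the renamed ml_pipeline_batch.py):

from workbench.scripts.ml_pipeline_batch import run_batch_job

# "medium" selects the 4 vCPU / 8GB job definition; the call returns 0 once the
# job has been RUNNING for ~2 minutes (or the job's exit code on completion).
exit_code = run_batch_job("my_pipeline.py", size="medium")
raise SystemExit(exit_code)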