workbench 0.8.178__py3-none-any.whl → 0.8.180__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +32 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
- workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/utils/model_utils.py +2 -1
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/xgboost_model_utils.py +160 -137
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/METADATA +1 -1
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/RECORD +26 -26
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/WHEEL +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/top_level.txt +0 -0
workbench/model_scripts/xgb_model/generated_model_script.py
CHANGED

@@ -32,10 +32,12 @@ TEMPLATE_PARAMS = {
     "target": "udm_asy_res_value",
"features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/
-    "train_all_data":
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/logd-hyper-80/training",
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)
 
     # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings
 
 
-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model
 
     Args:
         df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])
 
     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression
 
     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)
 
     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
 
        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
        # Put the scores into a dataframe
        score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")
 
     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
     return model
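Taken together, the changes to generated_model_script.py do two things: the tuned hyperparameter dict now flows from TEMPLATE_PARAMS into the XGBoost constructors via **hyperparameters, and the fitted sklearn wrapper is serialized with joblib, so model_fn() simply reloads it instead of rebuilding an estimator from XGBoost's JSON dump. A minimal sketch of that pattern outside SageMaker follows; the synthetic data, temp directory, and trimmed hyperparameter dict are illustrative assumptions, not Workbench code.

import os
import tempfile

import joblib
import numpy as np
import xgboost as xgb

# Hypothetical subset of the hyperparameters shown in TEMPLATE_PARAMS above
hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05}

# Unpack the dict into the sklearn wrapper, like the new **hyperparameters lines
X = np.random.rand(100, 4)
y = X @ np.array([1.0, -2.0, 0.5, 3.0])
xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
xgb_model.fit(X, y)

# Save the whole fitted wrapper with joblib, then reload it the way model_fn() now does
model_dir = tempfile.mkdtemp()
joblib.dump(xgb_model, os.path.join(model_dir, "xgb_model.joblib"))
reloaded = joblib.load(os.path.join(model_dir, "xgb_model.joblib"))
print(reloaded.predict(X[:3]))

Because the joblib artifact carries the full XGBClassifier or XGBRegressor object, the reloaded model no longer needs the removed JSON inspection to decide which estimator class to instantiate.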
workbench/model_scripts/xgb_model/xgb_model.template
CHANGED

@@ -33,9 +33,11 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
 
     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)
 
     # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings
 
 
-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model
 
     Args:
         df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
 
         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])
 
     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression
 
     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)
 
     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_
 
        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
 
        # Put the scores into a dataframe
        score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")
 
     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
@@ -370,19 +372,8 @@ if __name__ == "__main__":
 
 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
     return model
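The template side of the same change just adds the new placeholder: "{{hyperparameters}}" sits next to the existing "{{train_all_data}}" entry and is filled in when Workbench generates a concrete script (the tuned dict in generated_model_script.py above is one such rendering). The sketch below shows placeholder substitution in the abstract; the real rendering code is not part of this diff, so the mechanism shown here is an assumption for illustration only.

# Hypothetical stand-in for however Workbench renders its {{...}} placeholders
template_line = '    "hyperparameters": "{{hyperparameters}}",'
hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05}

# Replace the quoted placeholder with the dict's repr to get a concrete Python line
rendered = template_line.replace('"{{hyperparameters}}"', repr(hyperparameters))
print(rendered)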
workbench/utils/model_utils.py
CHANGED
@@ -222,7 +222,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
         lower_95, upper_95 = df["q_025"], df["q_975"]
         lower_90, upper_90 = df["q_05"], df["q_95"]
         lower_80, upper_80 = df["q_10"], df["q_90"]
-        lower_68
+        lower_68 = df.get("q_16", 0)
+        upper_68 = df.get("q_84", 0)
         lower_50, upper_50 = df["q_25"], df["q_75"]
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
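In uq_metrics(), the 68% band now comes from DataFrame.get with a scalar default, so inputs that never produced q_16 / q_84 quantile columns fall back to 0 instead of raising a KeyError. A small sketch of that fallback on a synthetic frame (not Workbench output):

import pandas as pd

# Synthetic quantile frame with no q_16 / q_84 columns
df = pd.DataFrame({"q_10": [1.0, 2.0], "q_90": [3.0, 4.0]})

lower_68 = df.get("q_16", 0)  # returns the column if present, else the scalar default
upper_68 = df.get("q_84", 0)
print(lower_68, upper_68)  # 0 0

When the columns do exist, df.get returns the usual Series, so the downstream interval math is unchanged.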
workbench/utils/shap_utils.py
CHANGED
@@ -212,6 +212,14 @@ def _calculate_shap_values(workbench_model, sample_df: pd.DataFrame = None):
         log.error("No XGBoost model found in the artifact.")
         return None, None, None, None
 
+    # Get the booster (SHAP requires the booster, not the sklearn wrapper)
+    if hasattr(xgb_model, "get_booster"):
+        # Full sklearn model - extract the booster
+        booster = xgb_model.get_booster()
+    else:
+        # Already a booster
+        booster = xgb_model
+
     # Load category mappings if available
     category_mappings = load_category_mappings_from_s3(model_artifact_uri)
 
@@ -229,8 +237,8 @@ def _calculate_shap_values(workbench_model, sample_df: pd.DataFrame = None):
     # Create a DMatrix with categorical support
     dmatrix = xgb.DMatrix(X, enable_categorical=True)
 
-    # Use XGBoost's built-in SHAP calculation
-    shap_values =
+    # Use XGBoost's built-in SHAP calculation (booster method, not sklearn)
+    shap_values = booster.predict(dmatrix, pred_contribs=True, strict_shape=True)
     features_with_bias = features + ["bias"]
 
     # Now we need to subset the columns based on top 10 SHAP values