workbench 0.8.172__py3-none-any.whl → 0.8.173__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  2. workbench/api/compound.py +1 -1
  3. workbench/api/monitor.py +1 -16
  4. workbench/core/artifacts/data_capture_core.py +315 -0
  5. workbench/core/artifacts/endpoint_core.py +9 -3
  6. workbench/core/artifacts/monitor_core.py +33 -249
  7. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  8. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +471 -0
  9. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +428 -0
  10. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  11. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +95 -204
  12. workbench/model_scripts/xgb_model/generated_model_script.py +5 -5
  13. workbench/repl/workbench_shell.py +3 -3
  14. workbench/utils/chem_utils/__init__.py +0 -0
  15. workbench/utils/chem_utils/fingerprints.py +134 -0
  16. workbench/utils/chem_utils/misc.py +194 -0
  17. workbench/utils/chem_utils/mol_descriptors.py +471 -0
  18. workbench/utils/chem_utils/mol_standardize.py +428 -0
  19. workbench/utils/chem_utils/mol_tagging.py +348 -0
  20. workbench/utils/chem_utils/projections.py +209 -0
  21. workbench/utils/chem_utils/salts.py +256 -0
  22. workbench/utils/chem_utils/sdf.py +292 -0
  23. workbench/utils/chem_utils/toxicity.py +250 -0
  24. workbench/utils/chem_utils/vis.py +253 -0
  25. workbench/utils/monitor_utils.py +49 -56
  26. workbench/utils/pandas_utils.py +3 -3
  27. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  28. {workbench-0.8.172.dist-info → workbench-0.8.173.dist-info}/METADATA +1 -1
  29. {workbench-0.8.172.dist-info → workbench-0.8.173.dist-info}/RECORD +33 -22
  30. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  31. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  32. workbench/utils/chem_utils.py +0 -1556
  33. {workbench-0.8.172.dist-info → workbench-0.8.173.dist-info}/WHEEL +0 -0
  34. {workbench-0.8.172.dist-info → workbench-0.8.173.dist-info}/entry_points.txt +0 -0
  35. {workbench-0.8.172.dist-info → workbench-0.8.173.dist-info}/licenses/LICENSE +0 -0
  36. {workbench-0.8.172.dist-info → workbench-0.8.173.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
- # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
- from mapie.regression import ConformalizedQuantileRegressor
- from lightgbm import LGBMRegressor
- from xgboost import XGBRegressor
+ # Model: NGBoost Regressor with Distribution output
+ from ngboost import NGBRegressor
+ from ngboost.distns import Cauchy, T
+ from xgboost import XGBRegressor # Point Estimator
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
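The hunks above and below are workbench/model_scripts/custom_models/uq_models/generated_model_script.py (file 11, +95 -204): the MAPIE/LightGBM conformalized-quantile stack is replaced by NGBoost, which fits a full predictive distribution, while XGBoost stays on as the point estimator. A minimal sketch of the NGBoost pattern the new script relies on (synthetic data; hyperparameters here are illustrative, not the script's):

    # Hedged sketch: fit an NGBoost distributional regressor and read off quantiles.
    import numpy as np
    from ngboost import NGBRegressor
    from ngboost.distns import T

    rng = np.random.default_rng(42)
    X = rng.random((200, 4))
    y = X @ np.array([1.0, -2.0, 0.5, 3.0]) + rng.normal(0, 0.1, 200)

    ngb = NGBRegressor(Dist=T, learning_rate=0.005)
    ngb.fit(X, y)

    y_dist = ngb.pred_dist(X[:5])   # distribution objects, not point values
    print(y_dist.params["loc"])     # per-row location parameter
    print(y_dist.ppf(0.975))        # per-row 97.5th percentile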
@@ -20,12 +20,19 @@ import numpy as np
  import pandas as pd
  from typing import List, Tuple

+ # Local Imports
+ from proximity import Proximity
+
+
+
  # Template Placeholders
  TEMPLATE_PARAMS = {
+ "id_column": "udm_mol_id",
  "target": "udm_asy_res_value",
  "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'chiral_centers', 'r_cnt', 's_cnt', 'db_stereo', 'e_cnt', 'z_cnt', 'chiral_fp', 'db_fp'],
  "compressed_features": [],
- "train_all_data": True
+ "train_all_data": False,
+ "track_columns": "udm_asy_res_value"
  }

@@ -101,7 +108,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping


  def decompress_features(
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
  ) -> Tuple[pd.DataFrame, List[str]]:
  """Prepare features for the model by decompressing bitstring features

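decompress_features expands bitstring features (such as the Morgan fingerprints added elsewhere in this release) into one column per bit before training; only its signature changes in this hunk. For readers unfamiliar with the idea, a hypothetical sketch of the pattern (not the package's actual implementation):

    # Hypothetical bitstring decompression: "0110..." -> one int column per bit.
    import pandas as pd
    from typing import List, Tuple

    def decompress_bitstrings(df: pd.DataFrame, compressed: List[str]) -> Tuple[pd.DataFrame, List[str]]:
        new_cols: List[str] = []
        for feat in compressed:
            bits = df[feat].apply(lambda s: [int(b) for b in s])
            expanded = pd.DataFrame(bits.tolist(), index=df.index)
            expanded.columns = [f"{feat}_{i}" for i in expanded.columns]
            df = pd.concat([df.drop(columns=[feat]), expanded], axis=1)
            new_cols.extend(expanded.columns)
        return df, new_cols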
@@ -157,11 +164,13 @@ def decompress_features(

  if __name__ == "__main__":
  # Template Parameters
+ id_column = TEMPLATE_PARAMS["id_column"]
  target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ track_columns = TEMPLATE_PARAMS["track_columns"] # Can be None
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -219,175 +228,78 @@ if __name__ == "__main__":
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

+ # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
+ xgb_model = XGBRegressor()
+ ngb_model = NGBRegressor() # (Dist=Cauchy) seems to give HUGE prediction intervals
+ ngb_model = NGBRegressor(
+ Dist=T,
+ learning_rate=0.005,
+ minibatch_frac=0.1, # Very small batches
+ col_sample=0.8 # This parameter DOES exist
+ ) # Testing this out
+ print("NGBoost using T distribution for uncertainty quantification")
+
  # Prepare features and targets for training
  X_train = df_train[features]
  X_validate = df_val[features]
  y_train = df_train[target]
  y_validate = df_val[target]

- # Train XGBoost for point predictions
- print("\nTraining XGBoost for point predictions...")
- xgb_model = XGBRegressor(
- n_estimators=1000,
- max_depth=6,
- learning_rate=0.01,
- subsample=0.8,
- colsample_bytree=0.8,
- random_state=42,
- verbosity=0
- )
+ # Train both models using the training data
  xgb_model.fit(X_train, y_train)
-
- # Evaluate XGBoost performance
- y_pred_xgb = xgb_model.predict(X_validate)
- xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
- xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
- xgb_r2 = r2_score(y_validate, y_pred_xgb)
-
- print(f"\nXGBoost Point Prediction Performance:")
- print(f"RMSE: {xgb_rmse:.3f}")
- print(f"MAE: {xgb_mae:.3f}")
- print(f"R2: {xgb_r2:.3f}")
-
- # Define confidence levels we want to model
- confidence_levels = [0.50, 0.80, 0.90, 0.95] # 50%, 80%, 90%, 95% confidence intervals
-
- # Store MAPIE models for each confidence level
- mapie_models = {}
-
- # Train models for each confidence level
- for confidence_level in confidence_levels:
- alpha = 1 - confidence_level
- lower_q = alpha / 2
- upper_q = 1 - alpha / 2
-
- print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
- print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
-
- # Train three models for this confidence level
- quantile_estimators = []
- for q in [lower_q, upper_q, 0.5]:
- print(f" Training model for quantile {q:.3f}...")
- est = LGBMRegressor(
- objective="quantile",
- alpha=q,
- n_estimators=1000,
- max_depth=6,
- learning_rate=0.01,
- num_leaves=31,
- min_child_samples=20,
- subsample=0.8,
- colsample_bytree=0.8,
- random_state=42,
- verbose=-1,
- force_col_wise=True
- )
- est.fit(X_train, y_train)
- quantile_estimators.append(est)
-
- # Create MAPIE CQR model for this confidence level
- print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
- mapie_model = ConformalizedQuantileRegressor(
- quantile_estimators,
- confidence_level=confidence_level,
- prefit=True
- )
-
- # Conformalize the model
- print(f" Conformalizing with validation data...")
- mapie_model.conformalize(X_validate, y_validate)
-
- # Store the model
- mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
-
- # Validate coverage for this confidence level
- y_pred, y_pis = mapie_model.predict_interval(X_validate)
- coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
- print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
-
- print(f"\nOverall Model Performance Summary:")
- print(f"XGBoost RMSE: {xgb_rmse:.3f}")
- print(f"XGBoost MAE: {xgb_mae:.3f}")
- print(f"XGBoost R2: {xgb_r2:.3f}")
+ ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
+
+ # Make Predictions on the Validation Set
+ print(f"Making Predictions on Validation Set...")
+ preds = xgb_model.predict(X_validate)
+
+ # Calculate various model performance metrics (regression)
+ rmse = root_mean_squared_error(y_validate, preds)
+ mae = mean_absolute_error(y_validate, preds)
+ r2 = r2_score(y_validate, preds)
+ print(f"RMSE: {rmse:.3f}")
+ print(f"MAE: {mae:.3f}")
+ print(f"R2: {r2:.3f}")
  print(f"NumRows: {len(df_val)}")

- # Analyze interval widths across confidence levels
- print(f"\nInterval Width Analysis:")
- for conf_level in confidence_levels:
- model = mapie_models[f"mapie_{conf_level:.2f}"]
- _, y_pis = model.predict_interval(X_validate)
- widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
- print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
-
  # Save the trained XGBoost model
  xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))

- # Save all MAPIE models
- for model_name, model in mapie_models.items():
- joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
+ # Save the trained NGBoost model
+ joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

- # Save the feature list
+ # Save the features (this will validate input during predictions)
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
- json.dump(features, fp)
-
- # Save category mappings if any
- if category_mappings:
- with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
- json.dump(category_mappings, fp)
-
- # Save model configuration
- model_config = {
- "model_type": "XGBoost_MAPIE_CQR_LightGBM",
- "confidence_levels": confidence_levels,
- "n_features": len(features),
- "target": target,
- "validation_metrics": {
- "xgb_rmse": float(xgb_rmse),
- "xgb_mae": float(xgb_mae),
- "xgb_r2": float(xgb_r2),
- "n_validation": len(df_val)
- }
- }
- with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
- json.dump(model_config, fp, indent=2)
+ json.dump(orig_features, fp) # We save the original features, not the decompressed ones
+
+ # Now the Proximity model
+ model = Proximity(df_train, id_column, features, target, track_columns=track_columns)

- print(f"\nModel training complete!")
- print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
+ # Now serialize the model
+ model.serialize(args.model_dir)


  #
  # Inference Section
  #
  def model_fn(model_dir) -> dict:
- """Load XGBoost and all MAPIE models from the specified directory."""
-
- # Load model configuration to know which models to load
- with open(os.path.join(model_dir, "model_config.json")) as fp:
- config = json.load(fp)
+ """Load and return XGBoost, NGBoost, and Prox Model from model directory."""

  # Load XGBoost regressor
  xgb_path = os.path.join(model_dir, "xgb_model.json")
  xgb_model = XGBRegressor(enable_categorical=True)
  xgb_model.load_model(xgb_path)

- # Load all MAPIE models
- mapie_models = {}
- for conf_level in config["confidence_levels"]:
- model_name = f"mapie_{conf_level:.2f}"
- mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
+ # Load NGBoost regressor
+ ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))

- # Load category mappings if they exist
- category_mappings = {}
- category_path = os.path.join(model_dir, "category_mappings.json")
- if os.path.exists(category_path):
- with open(category_path) as fp:
- category_mappings = json.load(fp)
+ # Deserialize the proximity model
+ prox_model = Proximity.deserialize(model_dir)

  return {
- "xgb_model": xgb_model,
- "mapie_models": mapie_models,
- "confidence_levels": config["confidence_levels"],
- "category_mappings": category_mappings
+ "xgboost": xgb_model,
+ "ngboost": ngb_model,
+ "proximity": prox_model
  }

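Net effect of the hunk above: training now writes three artifacts (xgb_model.json, ngb_model.joblib, and the serialized Proximity model) and model_fn returns them in a plain dict keyed xgboost/ngboost/proximity. A hedged smoke test of that contract, assuming a model_dir populated by the training step and the script's own model_fn/predict_fn in scope:

    # Hedged sketch: reload the three artifacts and run one prediction pass.
    import pandas as pd

    models = model_fn("/tmp/model_dir")
    assert set(models) == {"xgboost", "ngboost", "proximity"}

    # Feature names come back with the saved XGBoost booster (when trained on a DataFrame)
    feature_names = models["xgboost"].get_booster().feature_names
    sample = pd.DataFrame([{f: 0.0 for f in feature_names}])  # illustrative row
    # (the Proximity step may also expect the id column used at training time)
    out = predict_fn(sample, models)  # adds prediction, prediction_std, q_* and neighbor info
    print(out.filter(regex="^(prediction|q_)").columns.tolist())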
@@ -403,7 +315,7 @@ def input_fn(input_data, content_type):
  if "text/csv" in content_type:
  return pd.read_csv(StringIO(input_data))
  elif "application/json" in content_type:
- return pd.DataFrame(json.loads(input_data))
+ return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
  else:
  raise ValueError(f"{content_type} not supported!")

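input_fn's JSON branch assumes a records-style array, which pairs with output_fn's orient="records" in the next hunk. A quick shape check (hedged; column names are illustrative):

    # Both content types should parse to equivalent DataFrames.
    import json

    records = [{"smiles": "CCO", "mollogp": 0.2}, {"smiles": "c1ccccc1", "mollogp": 1.9}]
    df_json = input_fn(json.dumps(records), "application/json")
    df_csv = input_fn("smiles,mollogp\nCCO,0.2\nc1ccccc1,1.9", "text/csv")
    assert df_json.shape == df_csv.shape == (2, 2)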
@@ -411,26 +323,23 @@ def input_fn(input_data, content_type):
  def output_fn(output_df, accept_type):
  """Supports both CSV and JSON output formats."""
  if "text/csv" in accept_type:
- # Convert categorical columns to string to avoid fillna issues
- for col in output_df.select_dtypes(include=['category']).columns:
- output_df[col] = output_df[col].astype(str)
- csv_output = output_df.fillna("N/A").to_csv(index=False)
+ csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
  return csv_output, "text/csv"
  elif "application/json" in accept_type:
- return output_df.to_json(orient="records"), "application/json"
+ return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
  else:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


  def predict_fn(df, models) -> pd.DataFrame:
- """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
+ """Make Predictions with our XGB Quantile Regression Model

  Args:
  df (pd.DataFrame): The input DataFrame
- models (dict): Dictionary containing XGBoost and MAPIE models
+ models (dict): The dictionary of models to use for predictions

  Returns:
- pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
+ pd.DataFrame: The DataFrame with the predictions added
  """

  # Grab our feature columns (from training)
@@ -441,62 +350,44 @@ def predict_fn(df, models) -> pd.DataFrame:
  # Match features in a case-insensitive manner
  matched_df = match_features_case_insensitive(df, model_features)

- # Apply categorical mappings if they exist
- if models.get("category_mappings"):
- matched_df, _ = convert_categorical_types(
- matched_df,
- model_features,
- models["category_mappings"]
- )
+ # Use XGBoost for point predictions
+ df["prediction"] = models["xgboost"].predict(matched_df[model_features])
+
+ # NGBoost predict returns distribution objects
+ y_dists = models["ngboost"].pred_dist(matched_df[model_features])
+
+ # Extract parameters from distribution
+ dist_params = y_dists.params

- # Get features for prediction
- X = matched_df[model_features]
-
- # Get XGBoost point predictions
- df["prediction"] = models["xgb_model"].predict(X)
-
- # Get predictions from each MAPIE model for conformalized intervals
- for conf_level in models["confidence_levels"]:
- model_name = f"mapie_{conf_level:.2f}"
- model = models["mapie_models"][model_name]
-
- # Get conformalized predictions
- y_pred, y_pis = model.predict_interval(X)
-
- # Map confidence levels to quantile names
- if conf_level == 0.50: # 50% CI
- df["q_25"] = y_pis[:, 0, 0]
- df["q_75"] = y_pis[:, 1, 0]
- elif conf_level == 0.80: # 80% CI
- df["q_10"] = y_pis[:, 0, 0]
- df["q_90"] = y_pis[:, 1, 0]
- elif conf_level == 0.90: # 90% CI
- df["q_05"] = y_pis[:, 0, 0]
- df["q_95"] = y_pis[:, 1, 0]
- elif conf_level == 0.95: # 95% CI
- df["q_025"] = y_pis[:, 0, 0]
- df["q_975"] = y_pis[:, 1, 0]
-
- # Add median (q_50) from XGBoost prediction
- df["q_50"] = df["prediction"]
-
- # Calculate uncertainty metrics based on 95% interval
- interval_width = df["q_975"] - df["q_025"]
- df["prediction_std"] = interval_width / 3.92
+ # Extract mean and std from distribution parameters
+ df["prediction_uq"] = dist_params['loc'] # mean
+ df["prediction_std"] = dist_params['scale'] # standard deviation
+
+ # Add 95% prediction intervals using ppf (percent point function)
+ # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+ # so we need to adjust the bounds to include the point prediction
+ df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+ df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+ # Add 90% prediction intervals
+ df["q_05"] = y_dists.ppf(0.05) # 5th percentile
+ df["q_95"] = y_dists.ppf(0.95) # 95th percentile
+
+ # Add 80% prediction intervals
+ df["q_10"] = y_dists.ppf(0.10) # 10th percentile
+ df["q_90"] = y_dists.ppf(0.90) # 90th percentile
+
+ # Add 50% prediction intervals
+ df["q_25"] = y_dists.ppf(0.25) # 25th percentile
+ df["q_75"] = y_dists.ppf(0.75) # 75th percentile

  # Reorder the quantile columns for easier reading
  quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
  other_cols = [col for col in df.columns if col not in quantile_cols]
  df = df[other_cols + quantile_cols]

- # Uncertainty score
- df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
- # Confidence bands
- df["confidence_band"] = pd.cut(
- df["uncertainty_score"],
- bins=[0, 0.5, 1.0, 2.0, np.inf],
- labels=["high", "medium", "low", "very_low"]
- )
+ # Compute Nearest neighbors with Proximity model
+ models["proximity"].neighbors(df)

+ # Return the modified DataFrame
  return df
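The quantile columns are plain ppf evaluations of the fitted Student-t, with the 95% bounds widened so they always bracket the XGBoost point prediction. A worked single-row example using scipy directly (illustrative parameter values, not model output):

    # One row with fitted params loc=2.0, scale=0.5, df=3 (illustrative):
    import numpy as np
    from scipy.stats import t

    dist = t(df=3.0, loc=2.0, scale=0.5)
    q_025, q_975 = dist.ppf(0.025), dist.ppf(0.975)  # 95% interval
    q_25, q_75 = dist.ppf(0.25), dist.ppf(0.75)      # 50% interval

    point = 4.0  # hypothetical XGBoost prediction outside the NGBoost interval
    q_025, q_975 = np.minimum(q_025, point), np.maximum(q_975, point)
    print(f"95%: [{q_025:.2f}, {q_975:.2f}]  50%: [{q_25:.2f}, {q_75:.2f}]")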
@@ -28,12 +28,12 @@ from typing import List, Tuple

  # Template Parameters
  TEMPLATE_PARAMS = {
- "model_type": "regressor",
- "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+ "model_type": "classifier",
+ "target": "class",
+ "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/temp-hlm-phase1-reg-0-80/training",
- "train_all_data": False
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-class-f1-100/training",
+ "train_all_data": True
  }

  # Function to check if dataframe is empty
@@ -41,7 +41,7 @@ from workbench.cached.cached_meta import CachedMeta
  try:
  import rdkit # noqa
  import mordred # noqa
- from workbench.utils import chem_utils
+ from workbench.utils.chem_utils import vis

  HAVE_CHEM_UTILS = True
  except ImportError:
@@ -178,12 +178,12 @@ class WorkbenchShell:

  # Add cheminformatics utils if available
  if HAVE_CHEM_UTILS:
- self.commands["show"] = chem_utils.show
+ self.commands["show"] = vis.show

  def start(self):
  """Start the Workbench IPython shell"""
  cprint("magenta", "\nWelcome to Workbench!")
- if self.aws_status is False:
+ if not self.aws_status:
  cprint("red", "AWS Account Connection Failed...Review/Fix the Workbench Config:")
  cprint("red", f"Path: {self.cm.site_config_path}")
  self.show_config()
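These two shell hunks track the larger refactor in this release: the 1556-line workbench/utils/chem_utils.py is removed and replaced by the workbench/utils/chem_utils/ subpackage (fingerprints, misc, mol_descriptors, mol_standardize, mol_tagging, projections, salts, sdf, toxicity, vis). Callers migrate the way the shell does; sketched before/after (the show() argument is illustrative):

    # Before (0.8.172): one monolithic module
    # from workbench.utils import chem_utils
    # chem_utils.show(...)

    # After (0.8.173): focused submodules
    from workbench.utils.chem_utils import vis

    vis.show("CC(=O)OC1=CC=CC=C1C(=O)O")  # illustrative argument (aspirin SMILES)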
@@ -0,0 +1,134 @@
+ """Molecular fingerprint computation utilities"""
+
+ import logging
+ import pandas as pd
+
+ # Molecular Descriptor Imports
+ from rdkit import Chem
+ from rdkit.Chem import rdFingerprintGenerator
+ from rdkit.Chem.MolStandardize import rdMolStandardize
+
+ # Set up the logger
+ log = logging.getLogger("workbench")
+
+
+ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
+ """Compute and add Morgan fingerprints to the DataFrame.
+
+ Args:
+ df (pd.DataFrame): Input DataFrame containing SMILES strings.
+ radius (int): Radius for the Morgan fingerprint.
+ n_bits (int): Number of bits for the fingerprint.
+ counts (bool): Count simulation for the fingerprint.
+
+ Returns:
+ pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.
+
+ Note:
+ See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
+ """
+ delete_mol_column = False
+
+ # Check for the SMILES column (case-insensitive)
+ smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
+ if smiles_column is None:
+ raise ValueError("Input DataFrame must have a 'smiles' column")
+
+ # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
+ if "molecule" in df.columns and df["molecule"].dtype == "string":
+ log.warning("Detected serialized molecules in 'molecule' column. Removing...")
+ del df["molecule"]
+
+ # Convert SMILES to RDKit molecule objects (vectorized)
+ if "molecule" not in df.columns:
+ log.info("Converting SMILES to RDKit Molecules...")
+ delete_mol_column = True
+ df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
+ # Make sure our molecules are not None
+ failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
+ if failed_smiles:
+ log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
+ df = df.dropna(subset=["molecule"])
+
+ # If we have fragments in our compounds, get the largest fragment before computing fingerprints
+ largest_frags = df["molecule"].apply(
+ lambda mol: rdMolStandardize.LargestFragmentChooser().choose(mol) if mol else None
+ )
+
+ # Create a Morgan fingerprint generator
+ if counts:
+ n_bits *= 4 # Multiply by 4 to simulate counts
+ morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)
+
+ # Compute Morgan fingerprints (vectorized)
+ fingerprints = largest_frags.apply(
+ lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
+ )
+
+ # Add the fingerprints to the DataFrame
+ df["fingerprint"] = fingerprints
+
+ # Drop the intermediate 'molecule' column if it was added
+ if delete_mol_column:
+ del df["molecule"]
+ return df
+
+
+ if __name__ == "__main__":
+ print("Running molecular fingerprint tests...")
+ print("Note: This requires molecular_screening module to be available")
+
+ # Test molecules
+ test_molecules = {
+ "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+ "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
+ "glucose": "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O", # With stereochemistry
+ "sodium_acetate": "CC(=O)[O-].[Na+]", # Salt
+ "benzene": "c1ccccc1",
+ "butene_e": "C/C=C/C", # E-butene
+ "butene_z": "C/C=C\\C", # Z-butene
+ }
+
+ # Test 1: Morgan Fingerprints
+ print("\n1. Testing Morgan fingerprint generation...")
+
+ test_df = pd.DataFrame({"SMILES": list(test_molecules.values()), "name": list(test_molecules.keys())})
+
+ fp_df = compute_morgan_fingerprints(test_df.copy(), radius=2, n_bits=512, counts=False)
+
+ print(" Fingerprint generation results:")
+ for _, row in fp_df.iterrows():
+ fp = row.get("fingerprint", "N/A")
+ fp_len = len(fp) if fp != "N/A" else 0
+ print(f" {row['name']:15} → {fp_len} bits")
+
+ # Test 2: Different fingerprint parameters
+ print("\n2. Testing different fingerprint parameters...")
+
+ # Test with counts enabled
+ fp_counts_df = compute_morgan_fingerprints(test_df.copy(), radius=3, n_bits=256, counts=True)
+
+ print(" With count simulation (256 bits * 4):")
+ for _, row in fp_counts_df.iterrows():
+ fp = row.get("fingerprint", "N/A")
+ fp_len = len(fp) if fp != "N/A" else 0
+ print(f" {row['name']:15} → {fp_len} bits")
+
+ # Test 3: Edge cases
+ print("\n3. Testing edge cases...")
+
+ # Invalid SMILES
+ invalid_df = pd.DataFrame({"SMILES": ["INVALID", ""]})
+ try:
+ fp_invalid = compute_morgan_fingerprints(invalid_df.copy())
+ print(f" ✓ Invalid SMILES handled: {len(fp_invalid)} valid molecules")
+ except Exception as e:
+ print(f" ✓ Invalid SMILES properly raised error: {type(e).__name__}")
+
+ # Test with pre-existing molecule column
+ mol_df = test_df.copy()
+ mol_df["molecule"] = mol_df["SMILES"].apply(Chem.MolFromSmiles)
+ fp_with_mol = compute_morgan_fingerprints(mol_df)
+ print(f" ✓ Pre-existing molecule column handled: {len(fp_with_mol)} fingerprints generated")
+
+ print("\n✅ All fingerprint tests completed!")