workbench 0.8.213__py3-none-any.whl → 0.8.217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/api/__init__.py +3 -0
  9. workbench/api/endpoint.py +10 -5
  10. workbench/api/feature_set.py +76 -6
  11. workbench/api/meta_model.py +289 -0
  12. workbench/api/model.py +43 -4
  13. workbench/core/artifacts/endpoint_core.py +63 -115
  14. workbench/core/artifacts/feature_set_core.py +1 -1
  15. workbench/core/artifacts/model_core.py +6 -4
  16. workbench/core/pipelines/pipeline_executor.py +1 -1
  17. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  18. workbench/model_script_utils/pytorch_utils.py +11 -1
  19. workbench/model_scripts/chemprop/chemprop.template +145 -69
  20. workbench/model_scripts/chemprop/generated_model_script.py +147 -71
  21. workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
  22. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  23. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  24. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  25. workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
  26. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  27. workbench/model_scripts/meta_model/meta_model.template +209 -0
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
  29. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  30. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  31. workbench/model_scripts/script_generation.py +4 -0
  32. workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
  33. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  34. workbench/repl/workbench_shell.py +0 -5
  35. workbench/scripts/endpoint_test.py +2 -2
  36. workbench/utils/chem_utils/fingerprints.py +7 -3
  37. workbench/utils/chemprop_utils.py +23 -5
  38. workbench/utils/meta_model_simulator.py +471 -0
  39. workbench/utils/metrics_utils.py +94 -10
  40. workbench/utils/model_utils.py +91 -9
  41. workbench/utils/pytorch_utils.py +1 -1
  42. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  43. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
  44. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
  45. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  46. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  47. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
  48. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
  49. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
  50. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0
@@ -7,39 +7,30 @@
 # - Sample weights support
 # - Categorical feature handling
 # - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.

-import argparse
 import json
 import os

-import awswrangler as wr
 import joblib
 import numpy as np
 import pandas as pd
 import xgboost as xgb
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-from sklearn.preprocessing import LabelEncoder

 from model_script_utils import (
-    check_dataframe,
-    compute_classification_metrics,
-    compute_regression_metrics,
     convert_categorical_types,
     decompress_features,
     expand_proba_column,
     input_fn,
     match_features_case_insensitive,
     output_fn,
-    print_classification_metrics,
-    print_confusion_matrix,
-    print_regression_metrics,
 )
 from uq_harness import (
     compute_confidence,
     load_uq_models,
     predict_intervals,
-    save_uq_models,
-    train_uq_models,
 )

 # =============================================================================
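The import restructuring above follows a simple rule: only what the serving handlers need stays at module scope; everything else moves inside the `__main__` guard so a cold serverless endpoint never pays for sklearn or awswrangler. A minimal sketch of the pattern (module names here are illustrative, not the template's actual dependency list):

    import json  # light import: needed on the serving path

    def predict_fn(payload: dict) -> str:
        # Inference code sees only the module-scope (light) imports
        return json.dumps(payload)

    if __name__ == "__main__":
        # Heavy imports are paid only when the script runs as a training job
        from sklearn.model_selection import train_test_split  # illustrative heavy import
        print(train_test_split([1, 2, 3, 4], test_size=0.5))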
@@ -49,25 +40,27 @@ DEFAULT_HYPERPARAMETERS = {
     # Training parameters
     "n_folds": 5,  # Number of CV folds (1 = single train/val split)
     # Core tree parameters
-    "n_estimators": 200,
-    "max_depth": 6,
+    "n_estimators": 300,
+    "max_depth": 7,
     "learning_rate": 0.05,
-    # Sampling parameters
-    "subsample": 0.7,
-    "colsample_bytree": 0.6,
-    "colsample_bylevel": 0.8,
-    # Regularization
-    "min_child_weight": 5,
-    "gamma": 0.2,
-    "reg_alpha": 0.5,
-    "reg_lambda": 2.0,
+    # Sampling parameters (less aggressive - ensemble provides regularization)
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    # Regularization (lighter - ensemble averaging reduces overfitting)
+    "min_child_weight": 3,
+    "gamma": 0.1,
+    "reg_alpha": 0.1,
+    "reg_lambda": 1.0,
     # Random seed
-    "random_state": 42,
+    "seed": 42,
 }

 # Workbench-specific parameters (not passed to XGBoost)
 WORKBENCH_PARAMS = {"n_folds"}

+# Regression-only parameters (filtered out for classifiers)
+REGRESSION_ONLY_PARAMS = {"objective"}
+
 # Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
@@ -80,10 +73,140 @@ TEMPLATE_PARAMS = {
 }


+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
+    else:
+        n_ensemble = 1  # Legacy single model
+
+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))
+
+    print(f"Loaded {len(ensemble_models)} model(s)")
+
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+
+
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble."""
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    print(f"Model Features: {features}")
+
+    # Extract model components
+    ensemble_models = model_dict["ensemble_models"]
+    label_encoder = model_dict.get("label_encoder")
+    category_mappings = model_dict.get("category_mappings", {})
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
+
+    X = matched_df[features]
+
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)
+
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)
+
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
+        df = expand_proba_column(df, label_encoder.classes_)
+    else:
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)
+
+        # Add UQ intervals if available
+        if uq_models and uq_metadata:
+            df = predict_intervals(df, X, uq_models, uq_metadata)
+            df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
+
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+    return df
+
+
 # =============================================================================
 # Training
 # =============================================================================
 if __name__ == "__main__":
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+
     # -------------------------------------------------------------------------
     # Setup: Parse arguments and load data
     # -------------------------------------------------------------------------
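For orientation, SageMaker's Python serving stack invokes these handlers in a fixed order: model_fn once at container startup, then input_fn → predict_fn → output_fn per request, which is why the handlers now sit above the training block. A hedged local smoke test of the relocated handlers might look like this (the artifact directory and CSV payload are assumptions, not part of the diff):

    import os
    import pandas as pd

    # Hypothetical local check: assumes model.tar.gz was extracted to ./model_artifacts
    os.environ["SM_MODEL_DIR"] = os.path.abspath("model_artifacts")

    model_dict = model_fn(os.environ["SM_MODEL_DIR"])  # startup: load ensemble + metadata
    df = pd.read_csv("eval_rows.csv")                  # stand-in for input_fn's parsed payload
    preds = predict_fn(df, model_dict)                 # per-request: ensemble predict
    print(preds["prediction"].head())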
@@ -123,7 +246,7 @@ if __name__ == "__main__":
         all_df, features = decompress_features(all_df, features, compressed_features)

     # -------------------------------------------------------------------------
-    # Classification setup: Encode target labels
+    # Classification setup
     # -------------------------------------------------------------------------
     label_encoder = None
     if model_type == "classifier":
@@ -136,6 +259,18 @@
     # -------------------------------------------------------------------------
     n_folds = hyperparameters["n_folds"]
     xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
+
+    # Map 'seed' to 'random_state' for XGBoost
+    if "seed" in xgb_params:
+        xgb_params["random_state"] = xgb_params.pop("seed")
+
+    # Handle objective: filter regression-only params for classifiers, set default for regressors
+    if model_type == "classifier":
+        xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
+    else:
+        # Default to MAE (reg:absoluteerror) for regression if not specified
+        xgb_params.setdefault("objective", "reg:absoluteerror")
+
     print(f"XGBoost params: {xgb_params}")

     if n_folds == 1:
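Tracing the new parameter handling with the defaults above: n_folds is stripped as a Workbench-only key, seed is renamed to random_state for XGBoost's sklearn API, and regressors get reg:absoluteerror (MAE) unless an objective was supplied, while classifiers drop regression-only keys. A standalone sketch of that logic:

    DEFAULTS = {"n_folds": 5, "n_estimators": 300, "seed": 42}
    WORKBENCH_PARAMS = {"n_folds"}
    REGRESSION_ONLY_PARAMS = {"objective"}

    def shape_params(hyperparameters: dict, model_type: str) -> dict:
        # Strip Workbench-only keys, then normalize for XGBoost
        params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
        if "seed" in params:
            params["random_state"] = params.pop("seed")
        if model_type == "classifier":
            params = {k: v for k, v in params.items() if k not in REGRESSION_ONLY_PARAMS}
        else:
            params.setdefault("objective", "reg:absoluteerror")  # default to MAE
        return params

    print(shape_params(DEFAULTS, "classifier"))  # {'n_estimators': 300, 'random_state': 42}
    print(shape_params(DEFAULTS, "regressor"))   # adds 'objective': 'reg:absoluteerror'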
@@ -285,12 +420,10 @@
     # -------------------------------------------------------------------------
     # Save model artifacts
     # -------------------------------------------------------------------------
-    # Ensemble models
-    for idx, ens_model in enumerate(ensemble_models):
-        joblib.dump(ens_model, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
-    print(f"Saved {len(ensemble_models)} XGBoost model(s)")
+    for idx, m in enumerate(ensemble_models):
+        joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} model(s)")

-    # Metadata files
     with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
         json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)

@@ -310,125 +443,3 @@
         save_uq_models(uq_models, uq_metadata, args.model_dir)

     print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
-
-
-# =============================================================================
-# Model Loading (for SageMaker inference)
-# =============================================================================
-def model_fn(model_dir: str) -> dict:
-    """Load XGBoost ensemble and associated artifacts.
-
-    Args:
-        model_dir: Directory containing model artifacts
-
-    Returns:
-        Dictionary with ensemble_models, label_encoder, category_mappings, uq_models, etc.
-    """
-    # Load ensemble metadata
-    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
-    if os.path.exists(metadata_path):
-        with open(metadata_path) as f:
-            metadata = json.load(f)
-        n_ensemble = metadata["n_ensemble"]
-    else:
-        n_ensemble = 1  # Legacy single model
-
-    # Load ensemble models
-    ensemble_models = []
-    for i in range(n_ensemble):
-        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
-        if not os.path.exists(model_path):
-            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
-        ensemble_models.append(joblib.load(model_path))
-
-    # Load label encoder (classifier only)
-    label_encoder = None
-    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
-    if os.path.exists(encoder_path):
-        label_encoder = joblib.load(encoder_path)
-
-    # Load category mappings
-    category_mappings = {}
-    category_path = os.path.join(model_dir, "category_mappings.json")
-    if os.path.exists(category_path):
-        with open(category_path) as f:
-            category_mappings = json.load(f)
-
-    # Load UQ models (regression only)
-    uq_models, uq_metadata = None, None
-    uq_path = os.path.join(model_dir, "uq_metadata.json")
-    if os.path.exists(uq_path):
-        uq_models, uq_metadata = load_uq_models(model_dir)
-
-    return {
-        "ensemble_models": ensemble_models,
-        "n_ensemble": n_ensemble,
-        "label_encoder": label_encoder,
-        "category_mappings": category_mappings,
-        "uq_models": uq_models,
-        "uq_metadata": uq_metadata,
-    }
-
-
-# =============================================================================
-# Inference (for SageMaker inference)
-# =============================================================================
-def predict_fn(df: pd.DataFrame, models: dict) -> pd.DataFrame:
-    """Make predictions with XGBoost ensemble.
-
-    Args:
-        df: Input DataFrame with features
-        models: Dictionary from model_fn containing ensemble and metadata
-
-    Returns:
-        DataFrame with predictions added
-    """
-    # Load feature columns
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as f:
-        features = json.load(f)
-    print(f"Model Features: {features}")
-
-    # Extract model components
-    ensemble_models = models["ensemble_models"]
-    label_encoder = models.get("label_encoder")
-    category_mappings = models.get("category_mappings", {})
-    uq_models = models.get("uq_models")
-    uq_metadata = models.get("uq_metadata")
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
-
-    # Prepare features
-    matched_df = match_features_case_insensitive(df, features)
-    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
-
-    if compressed_features:
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
-
-    X = matched_df[features]
-
-    # Collect ensemble predictions
-    all_preds = [m.predict(X) for m in ensemble_models]
-    ensemble_preds = np.stack(all_preds, axis=0)
-
-    if label_encoder is not None:
-        # Classification: average probabilities, then argmax
-        all_probs = [m.predict_proba(X) for m in ensemble_models]
-        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
-        class_preds = np.argmax(avg_probs, axis=1)
-
-        df["prediction"] = label_encoder.inverse_transform(class_preds)
-        df["pred_proba"] = [p.tolist() for p in avg_probs]
-        df = expand_proba_column(df, label_encoder.classes_)
-    else:
-        # Regression: average predictions
-        df["prediction"] = np.mean(ensemble_preds, axis=0)
-        df["prediction_std"] = np.std(ensemble_preds, axis=0)
-
-        # Add UQ intervals if available
-        if uq_models and uq_metadata:
-            df = predict_intervals(df, X, uq_models, uq_metadata)
-            df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
-
-    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
-    return df
@@ -302,11 +302,6 @@ class WorkbenchShell:
         self.commands["PandasToView"] = importlib.import_module("workbench.core.views.pandas_to_view").PandasToView
         self.commands["Pipeline"] = importlib.import_module("workbench.api.pipeline").Pipeline

-        # Algorithms
-        self.commands["FSP"] = importlib.import_module(
-            "workbench.algorithms.dataframe.feature_space_proximity"
-        ).FeatureSpaceProximity
-
         # These are 'nice to have' imports
         self.commands["pd"] = importlib.import_module("pandas")
         self.commands["wr"] = importlib.import_module("awswrangler")
@@ -5,7 +5,7 @@ Usage:
    python model_script_harness.py <local_script.py> <model_name>

Example:
-    python model_script_harness.py pytorch.py aqsol-pytorch-reg
+    python model_script_harness.py pytorch.py aqsol-reg-pytorch

This allows you to test LOCAL changes to a model script against deployed model artifacts.
Evaluation data is automatically pulled from the FeatureSet (training = FALSE rows).
@@ -72,7 +72,7 @@ def main():
        print("Usage: python model_script_harness.py <local_script.py> <model_name>")
        print("\nArguments:")
        print("  local_script.py - Path to your LOCAL model script to test")
-        print("  model_name      - Workbench model name (e.g., aqsol-pytorch-reg)")
+        print("  model_name      - Workbench model name (e.g., aqsol-reg-pytorch)")
        print("\nOptional: testing/env.json with additional environment variables")
        sys.exit(1)

@@ -4,10 +4,14 @@ import logging
 import pandas as pd

 # Molecular Descriptor Imports
-from rdkit import Chem
+from rdkit import Chem, RDLogger
 from rdkit.Chem import rdFingerprintGenerator
 from rdkit.Chem.MolStandardize import rdMolStandardize

+# Suppress RDKit warnings (e.g., "not removing hydrogen atom without neighbors")
+# Keep errors enabled so we see actual problems
+RDLogger.DisableLog("rdApp.warning")
+
 # Set up the logger
 log = logging.getLogger("workbench")

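One caveat worth noting: RDKit's logger is process-global, so the DisableLog call above silences warnings for every caller that imports this module. Callers that want warnings back can re-enable them; a minimal sketch:

    from rdkit import Chem, RDLogger

    RDLogger.DisableLog("rdApp.warning")       # warnings off; rdApp.error stays on
    mol = Chem.MolFromSmiles("not-a-smiles")   # parse failure still logs an error
    print(mol)                                 # None

    RDLogger.EnableLog("rdApp.warning")        # restore warnings for downstream code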
@@ -47,8 +51,8 @@ def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=
     # Make sure our molecules are not None
     failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
     if failed_smiles:
-        log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
-        df = df.dropna(subset=["molecule"])
+        log.warning(f"Failed to convert {len(failed_smiles)} SMILES to molecules ({failed_smiles})")
+        df = df.dropna(subset=["molecule"]).copy()

     # If we have fragments in our compounds, get the largest fragment before computing fingerprints
     largest_frags = df["molecule"].apply(
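The added .copy() is presumably there because assigning new columns to the frame returned by dropna() can trigger pandas' SettingWithCopyWarning; copying makes ownership explicit before the fragment and fingerprint columns are added. A small illustration of the pattern (toy data):

    import pandas as pd

    df = pd.DataFrame({"molecule": [object(), None], "smiles": ["CCO", "bad"]})
    clean = df.dropna(subset=["molecule"]).copy()  # own the data before mutating
    clean["fingerprint"] = "placeholder"           # safe: no chained-assignment warning
    print(len(clean))                              # 1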
@@ -76,6 +76,10 @@ def pull_cv_results(workbench_model: Any) -> Tuple[pd.DataFrame, pd.DataFrame]:
     This retrieves the validation predictions saved during model training and
     computes metrics directly from them.

+    Note:
+        - Regression: Supports both single-target and multi-target models
+        - Classification: Only single-target is supported (with any number of classes)
+
     Args:
         workbench_model: Workbench model object

@@ -84,6 +88,7 @@ def pull_cv_results(workbench_model: Any) -> Tuple[pd.DataFrame, pd.DataFrame]:
         - DataFrame with computed metrics
         - DataFrame with validation predictions
     """
+
     # Get the validation predictions from S3
     s3_path = f"{workbench_model.model_training_path}/validation_predictions.csv"
     predictions_df = pull_s3_data(s3_path)
@@ -93,14 +98,27 @@

     log.info(f"Pulled {len(predictions_df)} validation predictions from {s3_path}")

-    # Compute metrics from predictions
+    # Get target and class labels
     target = workbench_model.target()
     class_labels = workbench_model.class_labels()

-    if target in predictions_df.columns and "prediction" in predictions_df.columns:
+    # If single target just use the "prediction" column
+    if isinstance(target, str):
         metrics_df = compute_metrics_from_predictions(predictions_df, target, class_labels)
-    else:
-        metrics_df = pd.DataFrame()
+        return metrics_df, predictions_df
+
+    # Multi-target regression
+    metrics_list = []
+    for t in target:
+        # Prediction will be {target}_pred in multi-target case
+        pred_col = f"{t}_pred"
+
+        # Drop NaNs for this target
+        target_preds_df = predictions_df.dropna(subset=[t, pred_col])
+        metrics_df = compute_metrics_from_predictions(target_preds_df, t, class_labels, prediction_col=pred_col)
+        metrics_df.insert(0, "target", t)
+        metrics_list.append(metrics_df)
+    metrics_df = pd.concat(metrics_list, ignore_index=True) if metrics_list else pd.DataFrame()

     return metrics_df, predictions_df

@@ -111,7 +129,7 @@ if __name__ == "__main__":
    from workbench.api import Model

    # Initialize Workbench model
-    model_name = "logd-reg-chemprop"
+    model_name = "open-admet-chemprop-mt"
    print(f"Loading Workbench model: {model_name}")
    model = Model(model_name)
    print(f"Model Framework: {model.model_framework}")