workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +12 -0
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/meta.py +5 -2
  7. workbench/api/model.py +16 -12
  8. workbench/api/monitor.py +1 -16
  9. workbench/core/artifacts/artifact.py +11 -3
  10. workbench/core/artifacts/data_capture_core.py +355 -0
  11. workbench/core/artifacts/endpoint_core.py +168 -78
  12. workbench/core/artifacts/feature_set_core.py +72 -13
  13. workbench/core/artifacts/model_core.py +50 -15
  14. workbench/core/artifacts/monitor_core.py +33 -248
  15. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  16. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  17. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  18. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  19. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  20. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  21. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  22. workbench/core/views/training_view.py +49 -53
  23. workbench/core/views/view.py +51 -1
  24. workbench/core/views/view_utils.py +4 -4
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  26. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  27. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  28. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  29. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  30. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  31. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  32. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  33. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  34. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  35. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  36. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  37. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  38. workbench/model_scripts/pytorch_model/pytorch.template +19 -20
  39. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  40. workbench/model_scripts/script_generation.py +7 -2
  41. workbench/model_scripts/uq_models/mapie.template +492 -0
  42. workbench/model_scripts/uq_models/requirements.txt +1 -0
  43. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  44. workbench/repl/workbench_shell.py +11 -6
  45. workbench/scripts/lambda_launcher.py +63 -0
  46. workbench/scripts/ml_pipeline_batch.py +137 -0
  47. workbench/scripts/ml_pipeline_sqs.py +186 -0
  48. workbench/scripts/monitor_cloud_watch.py +20 -100
  49. workbench/utils/aws_utils.py +4 -3
  50. workbench/utils/chem_utils/__init__.py +0 -0
  51. workbench/utils/chem_utils/fingerprints.py +134 -0
  52. workbench/utils/chem_utils/misc.py +194 -0
  53. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  54. workbench/utils/chem_utils/mol_standardize.py +450 -0
  55. workbench/utils/chem_utils/mol_tagging.py +348 -0
  56. workbench/utils/chem_utils/projections.py +209 -0
  57. workbench/utils/chem_utils/salts.py +256 -0
  58. workbench/utils/chem_utils/sdf.py +292 -0
  59. workbench/utils/chem_utils/toxicity.py +250 -0
  60. workbench/utils/chem_utils/vis.py +253 -0
  61. workbench/utils/cloudwatch_handler.py +1 -1
  62. workbench/utils/cloudwatch_utils.py +137 -0
  63. workbench/utils/config_manager.py +3 -7
  64. workbench/utils/endpoint_utils.py +5 -7
  65. workbench/utils/license_manager.py +2 -6
  66. workbench/utils/model_utils.py +76 -30
  67. workbench/utils/monitor_utils.py +44 -62
  68. workbench/utils/pandas_utils.py +3 -3
  69. workbench/utils/shap_utils.py +10 -2
  70. workbench/utils/workbench_logging.py +0 -3
  71. workbench/utils/workbench_sqs.py +1 -1
  72. workbench/utils/xgboost_model_utils.py +283 -145
  73. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  74. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  75. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  76. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
  77. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
  78. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
  79. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  80. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  81. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  82. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  83. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  84. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  85. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
  86. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  87. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  88. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  89. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  90. workbench/utils/chem_utils.py +0 -1556
  91. workbench/utils/execution_environment.py +0 -211
  92. workbench/utils/fast_inference.py +0 -167
  93. workbench/utils/resource_utils.py +0 -39
  94. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  95. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  96. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
@@ -1,34 +1,33 @@
  # Model: NGBoost Regressor with Distribution output
  from ngboost import NGBRegressor
- from xgboost import XGBRegressor  # Base Estimator
+ from ngboost.distns import Cauchy
+ from xgboost import XGBRegressor  # Point Estimator
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
-     mean_absolute_error,
-     r2_score,
-     root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
  import argparse
  import joblib
  import os
+ import numpy as np
  import pandas as pd
+ from typing import List, Tuple

  # Local Imports
  from proximity import Proximity


-
  # Template Placeholders
  TEMPLATE_PARAMS = {
      "id_column": "{{id_column}}",
-     "features": "{{feature_list}}",
      "target": "{{target_column}}",
+     "features": "{{feature_list}}",
+     "compressed_features": "{{compressed_features}}",
      "train_all_data": "{{train_all_data}}",
-     "track_columns": "{{track_columns}}"
+     "track_columns": "{{track_columns}}",
  }


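For context, the `{{...}}` tokens above are placeholders that the package's script generation step (see workbench/model_scripts/script_generation.py in the file list) presumably substitutes before the training script runs. A minimal sketch of that substitution, with made-up parameter values and a hypothetical render helper, not the package's actual implementation:

    # Hypothetical sketch of template rendering; the values below are illustrative only.
    template_params = {
        "id_column": "compound_id",               # assumed example value
        "target_column": "solubility",            # assumed example value
        "feature_list": ["logp", "tpsa", "fp"],   # assumed example value
        "compressed_features": ["fp"],            # bitstring feature to expand at train time
        "train_all_data": False,
        "track_columns": None,
    }

    def render_template(template_text: str, params: dict) -> str:
        """Replace each quoted "{{key}}" placeholder with a Python literal."""
        for key, value in params.items():
            template_text = template_text.replace('"{{' + key + '}}"', repr(value))
        return template_text

    # e.g. rendered = render_template(template_source, template_params)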
@@ -72,16 +71,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
      return df.rename(columns=rename_dict)


- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
  if __name__ == "__main__":
      # Template Parameters
      id_column = TEMPLATE_PARAMS["id_column"]
-     features = TEMPLATE_PARAMS["features"]
      target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
      track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
      validation_split = 0.2
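To make the new decompress_features helper concrete, here is a small usage sketch; the column names and 8-bit "fp" bitstrings are invented for illustration (real fingerprint bitstrings are much longer):

    import numpy as np
    import pandas as pd

    # Toy frame: one numeric feature plus a compressed bitstring feature "fp"
    df = pd.DataFrame(
        {
            "id": [1, 2],
            "logp": [1.2, 3.4],
            "fp": ["10110010", "01100101"],
        }
    )
    features = ["logp", "fp"]

    df, features = decompress_features(df, features, compressed_features=["fp"])

    print(features)
    # ['logp', 'fp_0', 'fp_1', 'fp_2', 'fp_3', 'fp_4', 'fp_5', 'fp_6', 'fp_7']
    print(df.columns.tolist())
    # ['id', 'logp', 'fp_0', ..., 'fp_7']  -- the original 'fp' column is dropped

Each bit becomes its own uint8 column, which is what the XGBoost/NGBoost models train on downstream, while the saved feature list (see the orig_features change later in this hunk) keeps the compressed names for input validation.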
@@ -95,53 +177,62 @@ if __name__ == "__main__":
      )
      args = parser.parse_args()

-     # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
+     # Read the training data into DataFrames
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")

-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)

-     # Training data split logic
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
      if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
      else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")

      # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
      xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()
+     ngb_model = NGBRegressor()  # Dist=Cauchy seems to give HUGE prediction intervals

      # Prepare features and targets for training
      X_train = df_train[features]
-     X_val = df_val[features]
+     X_validate = df_val[features]
      y_train = df_train[target]
-     y_val = df_val[target]
+     y_validate = df_val[target]

      # Train both models using the training data
      xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

      # Make Predictions on the Validation Set
      print(f"Making Predictions on Validation Set...")
-     y_validate = df_val[target]
-     X_validate = df_val[features]
      preds = xgb_model.predict(X_validate)

      # Calculate various model performance metrics (regression)
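A minimal end-to-end sketch of the hybrid scheme this training section sets up, with XGBoost supplying point predictions and NGBoost supplying the predictive distribution; the synthetic data below is purely illustrative and is not part of the package:

    import numpy as np
    import pandas as pd
    from ngboost import NGBRegressor
    from sklearn.model_selection import train_test_split
    from xgboost import XGBRegressor

    # Synthetic regression data (illustration only)
    rng = np.random.default_rng(42)
    X = pd.DataFrame({"x1": rng.normal(size=500), "x2": rng.normal(size=500)})
    y = 3.0 * X["x1"] - 2.0 * X["x2"] + rng.normal(scale=0.5, size=500)
    X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the point estimator and the distributional estimator on the same split
    xgb_model = XGBRegressor().fit(X_train, y_train)
    ngb_model = NGBRegressor().fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

    point_preds = xgb_model.predict(X_validate)   # point predictions
    y_dists = ngb_model.pred_dist(X_validate)     # per-row predictive distributions
    print(point_preds[:3], y_dists.params["loc"][:3], y_dists.params["scale"][:3])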
@@ -159,9 +250,9 @@ if __name__ == "__main__":
      # Save the trained NGBoost model
      joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-     # Save the feature list to validate input during predictions
+     # Save the features (this will validate input during predictions)
      with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
+         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

      # Now the Proximity model
      model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -187,11 +278,7 @@ def model_fn(model_dir) -> dict:
      # Deserialize the proximity model
      prox_model = Proximity.deserialize(model_dir)

-     return {
-         "xgboost": xgb_model,
-         "ngboost": ngb_model,
-         "proximity": prox_model
-     }
+     return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


  def input_fn(input_data, content_type):
@@ -251,20 +338,31 @@ def predict_fn(df, models) -> pd.DataFrame:
      dist_params = y_dists.params

      # Extract mean and std from distribution parameters
-     df["prediction_uq"] = dist_params['loc']  # mean
-     df["prediction_std"] = dist_params['scale']  # standard deviation
+     df["prediction_uq"] = dist_params["loc"]  # mean
+     df["prediction_std"] = dist_params["scale"]  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
-     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+     # so we need to adjust the bounds to include the point prediction
+     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-     # Adjust prediction intervals to include point predictions
-     df["q_025"] = df[["q_025", "prediction"]].min(axis=1)
-     df["q_975"] = df[["q_975", "prediction"]].max(axis=1)
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Compute Nearest neighbors with Proximity model
      models["proximity"].neighbors(df)
@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
-     mean_absolute_error,
-     r2_score,
-     root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
@@ -21,7 +17,7 @@ import pandas as pd
  TEMPLATE_PARAMS = {
      "features": "{{feature_list}}",
      "target": "{{target_column}}",
-     "train_all_data": "{{train_all_data}}"
+     "train_all_data": "{{train_all_data}}",
  }


@@ -87,10 +83,7 @@ if __name__ == "__main__":
      args = parser.parse_args()

      # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
@@ -212,16 +205,29 @@ def predict_fn(df, model) -> pd.DataFrame:
      dist_params = y_dists.params

      # Extract mean and std from distribution parameters
-     df["prediction"] = dist_params['loc']  # mean
-     df["prediction_std"] = dist_params['scale']  # standard deviation
+     df["prediction"] = dist_params["loc"]  # mean
+     df["prediction_std"] = dist_params["scale"]  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
      df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
      df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25) # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75) # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Return the modified DataFrame
      return df