workbench 0.8.213__py3-none-any.whl → 0.8.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/algorithms/sql/outliers.py +3 -3
  9. workbench/api/__init__.py +3 -0
  10. workbench/api/endpoint.py +10 -5
  11. workbench/api/feature_set.py +76 -6
  12. workbench/api/meta_model.py +289 -0
  13. workbench/api/model.py +43 -4
  14. workbench/core/artifacts/endpoint_core.py +65 -117
  15. workbench/core/artifacts/feature_set_core.py +3 -3
  16. workbench/core/artifacts/model_core.py +6 -4
  17. workbench/core/pipelines/pipeline_executor.py +1 -1
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  19. workbench/model_script_utils/model_script_utils.py +15 -11
  20. workbench/model_script_utils/pytorch_utils.py +11 -1
  21. workbench/model_scripts/chemprop/chemprop.template +147 -71
  22. workbench/model_scripts/chemprop/generated_model_script.py +151 -75
  23. workbench/model_scripts/chemprop/model_script_utils.py +15 -11
  24. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  25. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  27. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  28. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  29. workbench/model_scripts/meta_model/meta_model.template +209 -0
  30. workbench/model_scripts/pytorch_model/generated_model_script.py +45 -27
  31. workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
  32. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  33. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  34. workbench/model_scripts/script_generation.py +4 -0
  35. workbench/model_scripts/xgb_model/generated_model_script.py +167 -156
  36. workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
  37. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  38. workbench/repl/workbench_shell.py +0 -5
  39. workbench/scripts/endpoint_test.py +2 -2
  40. workbench/scripts/meta_model_sim.py +35 -0
  41. workbench/utils/chem_utils/fingerprints.py +87 -46
  42. workbench/utils/chemprop_utils.py +23 -5
  43. workbench/utils/meta_model_simulator.py +499 -0
  44. workbench/utils/metrics_utils.py +94 -10
  45. workbench/utils/model_utils.py +91 -9
  46. workbench/utils/pytorch_utils.py +1 -1
  47. workbench/utils/shap_utils.py +1 -55
  48. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  49. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/METADATA +2 -1
  50. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/RECORD +54 -50
  51. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
  52. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  53. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  54. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  55. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  56. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
  57. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
  58. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
workbench/model_scripts/custom_models/uq_models/meta_uq.template (deleted)
@@ -1,377 +0,0 @@
- # Model: NGBoost Regressor with Distribution output
- from ngboost import NGBRegressor
- from ngboost.distns import Cauchy
- from xgboost import XGBRegressor  # Point Estimator
- from sklearn.model_selection import train_test_split
-
- # Model Performance Scores
- from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
- from scipy.stats import spearmanr
-
- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import numpy as np
- import pandas as pd
- from typing import List, Tuple
-
- # Local Imports
- from proximity import Proximity
-
-
- # Template Placeholders
- TEMPLATE_PARAMS = {
-     "id_column": "{{id_column}}",
-     "target": "{{target_column}}",
-     "features": "{{feature_list}}",
-     "compressed_features": "{{compressed_features}}",
-     "train_all_data": "{{train_all_data}}",
-     "track_columns": "{{track_columns}}",
- }
-
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """
-     Check if the provided dataframe is empty and raise an exception if it is.
-
-     Args:
-         df (pd.DataFrame): DataFrame to check
-         df_name (str): Name of the DataFrame
-     """
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """
-     Matches and renames DataFrame columns to match model feature names (case-insensitive).
-     Prioritizes exact matches, then case-insensitive matches.
-
-     Raises ValueError if any model features cannot be matched.
-     """
-     df_columns_lower = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-     missing = []
-     for feature in model_features:
-         if feature in df.columns:
-             continue  # Exact match
-         elif feature.lower() in df_columns_lower:
-             rename_dict[df_columns_lower[feature.lower()]] = feature
-         else:
-             missing.append(feature)
-
-     if missing:
-         raise ValueError(f"Features not found: {missing}")
-
-     # Rename the DataFrame columns to match the model features
-     return df.rename(columns=rename_dict)
-
-
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-     """
-     Converts appropriate columns to categorical type with consistent mappings.
-
-     Args:
-         df (pd.DataFrame): The DataFrame to process.
-         features (list): List of feature names to consider for conversion.
-         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-             training mode. If populated, we're in inference mode.
-
-     Returns:
-         tuple: (processed DataFrame, category mappings dictionary)
-     """
-     # Training mode
-     if category_mappings == {}:
-         for col in df.select_dtypes(include=["object", "string"]):
-             if col in features and df[col].nunique() < 20:
-                 print(f"Training mode: Converting {col} to category")
-                 df[col] = df[col].astype("category")
-                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
-
-     # Inference mode
-     else:
-         for col, categories in category_mappings.items():
-             if col in df.columns:
-                 print(f"Inference mode: Applying categorical mapping for {col}")
-                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-     return df, category_mappings
-
-
- def decompress_features(
-     df: pd.DataFrame, features: List[str], compressed_features: List[str]
- ) -> Tuple[pd.DataFrame, List[str]]:
-     """Prepare features for the model by decompressing bitstring features
-
-     Args:
-         df (pd.DataFrame): The features DataFrame
-         features (List[str]): Full list of feature names
-         compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-     Returns:
-         pd.DataFrame: DataFrame with the decompressed features
-         List[str]: Updated list of feature names after decompression
-
-     Raises:
-         ValueError: If any missing values are found in the specified features
-     """
-
-     # Check for any missing values in the required features
-     missing_counts = df[features].isna().sum()
-     if missing_counts.any():
-         missing_features = missing_counts[missing_counts > 0]
-         print(
-             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-             "WARNING: You might want to remove/replace all NaN values before processing."
-         )
-
-     # Decompress the specified compressed features
-     decompressed_features = features.copy()
-     for feature in compressed_features:
-         if (feature not in df.columns) or (feature not in features):
-             print(f"Feature '{feature}' not in the features list, skipping decompression.")
-             continue
-
-         # Remove the feature from the list of features to avoid duplication
-         decompressed_features.remove(feature)
-
-         # Handle all compressed features as bitstrings
-         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-         prefix = feature[:3]
-
-         # Create all new columns at once - avoids fragmentation
-         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-         # Add to features list
-         decompressed_features.extend(new_col_names)
-
-         # Drop original column and concatenate new ones
-         df = df.drop(columns=[feature])
-         df = pd.concat([df, new_df], axis=1)
-
-     return df, decompressed_features
-
-
- if __name__ == "__main__":
-     # Template Parameters
-     id_column = TEMPLATE_PARAMS["id_column"]
-     target = TEMPLATE_PARAMS["target"]
-     features = TEMPLATE_PARAMS["features"]
-     orig_features = features.copy()
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
-     validation_split = 0.2
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Read the training data into DataFrames
-     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
-     print(f"Training Files: {training_files}")
-
-     # Combine files and read them all into a single pandas dataframe
-     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the dataframe is empty
-     check_dataframe(all_df, "training_df")
-
-     # Features/Target output
-     print(f"Target: {target}")
-     print(f"Features: {str(features)}")
-
-     # Convert any features that might be categorical to 'category' type
-     all_df, category_mappings = convert_categorical_types(all_df, features)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print(f"Decompressing features {compressed_features}...")
-         all_df, features = decompress_features(all_df, features, compressed_features)
-
-     # Do we want to train on all the data?
-     if train_all_data:
-         print("Training on ALL of the data")
-         df_train = all_df.copy()
-         df_val = all_df.copy()
-
-     # Does the dataframe have a training column?
-     elif "training" in all_df.columns:
-         print("Found training column, splitting data based on training column")
-         df_train = all_df[all_df["training"]]
-         df_val = all_df[~all_df["training"]]
-     else:
-         # Just do a random training Split
-         print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-     print(f"FIT/TRAIN: {df_train.shape}")
-     print(f"VALIDATION: {df_val.shape}")
-
-     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
-     xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()  # Dist=Cauchy) Seems to give HUGE prediction intervals
-
-     # Prepare features and targets for training
-     X_train = df_train[features]
-     X_validate = df_val[features]
-     y_train = df_train[target]
-     y_validate = df_val[target]
-
-     # Train both models using the training data
-     xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
-
-     # Make Predictions on the Validation Set
-     print(f"Making Predictions on Validation Set...")
-     preds = xgb_model.predict(X_validate)
-
-     # Calculate various model performance metrics (regression)
-     rmse = root_mean_squared_error(y_validate, preds)
-     mae = mean_absolute_error(y_validate, preds)
-     medae = median_absolute_error(y_validate, preds)
-     r2 = r2_score(y_validate, preds)
-     spearman_corr = spearmanr(y_validate, preds).correlation
-     support = len(df_val)
-     print(f"rmse: {rmse:.3f}")
-     print(f"mae: {mae:.3f}")
-     print(f"medae: {medae:.3f}")
-     print(f"r2: {r2:.3f}")
-     print(f"spearmanr: {spearman_corr:.3f}")
-     print(f"support: {support}")
-
-     # Save the trained XGBoost model
-     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
-
-     # Save the trained NGBoost model
-     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
-
-     # Save the features (this will validate input during predictions)
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-
-     # Now the Proximity model
-     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
-
-     # Now serialize the model
-     model.serialize(args.model_dir)
-
-
- #
- # Inference Section
- #
- def model_fn(model_dir) -> dict:
-     """Load and return XGBoost, NGBoost, and Prox Model from model directory."""
-
-     # Load XGBoost regressor
-     xgb_path = os.path.join(model_dir, "xgb_model.json")
-     xgb_model = XGBRegressor(enable_categorical=True)
-     xgb_model.load_model(xgb_path)
-
-     # Load NGBoost regressor
-     ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))
-
-     # Deserialize the proximity model
-     prox_model = Proximity.deserialize(model_dir)
-
-     return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, models) -> pd.DataFrame:
-     """Make Predictions with our XGB Quantile Regression Model
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         models (dict): The dictionary of models to use for predictions
-
-     Returns:
-         pd.DataFrame: The DataFrame with the predictions added
-     """
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         model_features = json.load(fp)
-
-     # Match features in a case-insensitive manner
-     matched_df = match_features_case_insensitive(df, model_features)
-
-     # Use XGBoost for point predictions
-     df["prediction"] = models["xgboost"].predict(matched_df[model_features])
-
-     # NGBoost predict returns distribution objects
-     y_dists = models["ngboost"].pred_dist(matched_df[model_features])
-
-     # Extract parameters from distribution
-     dist_params = y_dists.params
-
-     # Extract mean and std from distribution parameters
-     df["prediction_uq"] = dist_params["loc"]  # mean
-     df["prediction_std"] = dist_params["scale"]  # standard deviation
-
-     # Add 95% prediction intervals using ppf (percent point function)
-     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
-     # so we need to adjust the bounds to include the point prediction
-     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
-     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
-
-     # Add 90% prediction intervals
-     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
-     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
-
-     # Add 80% prediction intervals
-     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
-     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
-
-     # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
-
-     # Reorder the quantile columns for easier reading
-     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
-     other_cols = [col for col in df.columns if col not in quantile_cols]
-     df = df[other_cols + quantile_cols]
-
-     # Compute Nearest neighbors with Proximity model
-     models["proximity"].neighbors(df)
-
-     # Return the modified DataFrame
-     return df
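
For reference, the removed meta_uq.template follows the standard SageMaker script-mode handler contract (model_fn, input_fn, predict_fn, output_fn). The sketch below shows how those handlers could be exercised locally against a trained model directory; it is illustrative only — the module name meta_uq_script, the ./model path, and the feature columns are hypothetical and not part of the package.

import os

# predict_fn reads feature_columns.json from SM_MODEL_DIR, so point it at a local copy (assumption)
os.environ["SM_MODEL_DIR"] = "./model"

import meta_uq_script  # hypothetical module name for a rendered copy of the template above

# Load the XGBoost, NGBoost, and Proximity artifacts saved by the training section
models = meta_uq_script.model_fn("./model")

# Parse a small JSON payload into a DataFrame (columns are illustrative)
df = meta_uq_script.input_fn('[{"id": 1, "feat_a": 0.12, "feat_b": 3.4}]', "application/json")

# Run the hybrid point + UQ prediction, then serialize the result as CSV
result = meta_uq_script.predict_fn(df, models)
body, mime = meta_uq_script.output_fn(result, "text/csv")
print(mime)                  # "text/csv"
print(body.splitlines()[0])  # header includes prediction, prediction_uq, prediction_std, and q_* columns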