workbench 0.8.170__py3-none-any.whl → 0.8.172__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (27)
  1. workbench/api/feature_set.py +4 -4
  2. workbench/core/artifacts/artifact.py +11 -3
  3. workbench/core/artifacts/model_core.py +37 -14
  4. workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
  5. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  6. workbench/core/transforms/features_to_model/features_to_model.py +4 -4
  7. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +319 -210
  8. workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
  9. workbench/model_scripts/custom_models/uq_models/meta_uq.template +154 -41
  10. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -2
  11. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  12. workbench/model_scripts/script_generation.py +5 -0
  13. workbench/model_scripts/xgb_model/generated_model_script.py +11 -11
  14. workbench/model_scripts/xgb_model/xgb_model.template +7 -7
  15. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +1 -1
  16. workbench/scripts/ml_pipeline_sqs.py +139 -0
  17. workbench/utils/model_utils.py +13 -1
  18. workbench/utils/workbench_sqs.py +1 -1
  19. workbench/utils/xgboost_model_utils.py +1 -0
  20. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  21. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/METADATA +1 -1
  22. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/RECORD +26 -25
  23. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/entry_points.txt +2 -1
  24. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  25. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/WHEEL +0 -0
  26. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.170.dist-info → workbench-0.8.172.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
  # Model: NGBoost Regressor with Distribution output
  from ngboost import NGBRegressor
- from xgboost import XGBRegressor  # Base Estimator
+ from ngboost.distns import Cauchy
+ from xgboost import XGBRegressor  # Point Estimator
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
@@ -15,7 +16,9 @@ import json
  import argparse
  import joblib
  import os
+ import numpy as np
  import pandas as pd
+ from typing import List, Tuple

  # Local Imports
  from proximity import Proximity
@@ -25,8 +28,9 @@ from proximity import Proximity
  # Template Placeholders
  TEMPLATE_PARAMS = {
      "id_column": "{{id_column}}",
-     "features": "{{feature_list}}",
      "target": "{{target_column}}",
+     "features": "{{feature_list}}",
+     "compressed_features": "{{compressed_features}}",
      "train_all_data": "{{train_all_data}}",
      "track_columns": "{{track_columns}}"
  }
@@ -72,16 +76,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      return df.rename(columns=rename_dict)


- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
  if __name__ == "__main__":
      # Template Parameters
      id_column = TEMPLATE_PARAMS["id_column"]
-     features = TEMPLATE_PARAMS["features"]
      target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
      track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
      validation_split = 0.2
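
Editor's note: the decompress_features helper added above expands a bitstring column into one uint8 column per bit. A minimal usage sketch (toy data, not part of the package; assumes the decompress_features definition above and pandas/numpy are in scope):

    import pandas as pd

    # Toy DataFrame with one ordinary feature and one bitstring-compressed feature
    df = pd.DataFrame({"mol_weight": [180.2, 46.1], "fingerprint": ["1010", "0111"]})
    features = ["mol_weight", "fingerprint"]

    df, features = decompress_features(df, features, compressed_features=["fingerprint"])
    print(features)               # ['mol_weight', 'fin_0', 'fin_1', 'fin_2', 'fin_3']
    print(df.columns.tolist())    # original 'fingerprint' column replaced by fin_* bit columns
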
@@ -95,53 +182,68 @@ if __name__ == "__main__":
      )
      args = parser.parse_args()

-     # Load training data from the specified directory
+     # Read the training data into DataFrames
      training_files = [
          os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
+         for file in os.listdir(args.train)
+         if file.endswith(".csv")
      ]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")

-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)

-     # Training data split logic
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
      if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
      else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(
+             all_df, test_size=validation_split, random_state=42
+         )
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")

      # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
      xgb_model = XGBRegressor()
-     ngb_model = NGBRegressor()
+     ngb_model = NGBRegressor()  # Dist=Cauchy) Seems to give HUGE prediction intervals

      # Prepare features and targets for training
      X_train = df_train[features]
-     X_val = df_val[features]
+     X_validate = df_val[features]
      y_train = df_train[target]
-     y_val = df_val[target]
+     y_validate = df_val[target]

      # Train both models using the training data
      xgb_model.fit(X_train, y_train)
-     ngb_model.fit(X_train, y_train, X_val=X_val, Y_val=y_val)
+     ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

      # Make Predictions on the Validation Set
      print(f"Making Predictions on Validation Set...")
-     y_validate = df_val[target]
-     X_validate = df_val[features]
      preds = xgb_model.predict(X_validate)

      # Calculate various model performance metrics (regression)
@@ -159,9 +261,9 @@ if __name__ == "__main__":
      # Save the trained NGBoost model
      joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-     # Save the feature list to validate input during predictions
+     # Save the features (this will validate input during predictions)
      with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
+         json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

      # Now the Proximity model
      model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -255,16 +357,27 @@ def predict_fn(df, models) -> pd.DataFrame:
      df["prediction_std"] = dist_params['scale']  # standard deviation

      # Add 95% prediction intervals using ppf (percent point function)
-     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
-     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile
+     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+     # so we need to adjust the bounds to include the point prediction
+     df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+     df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)   # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)   # 75th percentile

-     # Adjust prediction intervals to include point predictions
-     df["q_025"] = df[["q_025", "prediction"]].min(axis=1)
-     df["q_975"] = df[["q_975", "prediction"]].max(axis=1)
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Compute Nearest neighbors with Proximity model
      models["proximity"].neighbors(df)
@@ -219,9 +219,22 @@ def predict_fn(df, model) -> pd.DataFrame:
      df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
      df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+     # Add 90% prediction intervals
+     df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+     df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+     # Add 80% prediction intervals
+     df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+     df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
      # Add 50% prediction intervals
-     df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-     df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+     df["q_25"] = y_dists.ppf(0.25)   # 25th percentile
+     df["q_75"] = y_dists.ppf(0.75)   # 75th percentile
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]

      # Return the modified DataFrame
      return df
@@ -1,3 +1 @@
- # Note: NGBoost is not included in the default inference image, so it must be specified here.
- ngboost
- mapie
+ # Note: Most libs are already in the training/inference images, ONLY specify additional libs here
@@ -70,6 +70,11 @@ def fill_template(template_path: str, params: dict, output_script: str) -> str:
      # Sanity check to ensure all placeholders were replaced
      if "{{" in template and "}}" in template:
          msg = "Not all template placeholders were replaced. Please check your params."
+
+         # Show which placeholders are still present
+         start = template.index("{{")
+         end = template.index("}}", start) + 2
+         msg += f" Unreplaced placeholder: {template[start:end]}"
          log.critical(msg)
          raise ValueError(msg)

@@ -28,12 +28,12 @@ from typing import List, Tuple

  # Template Parameters
  TEMPLATE_PARAMS = {
- "model_type": "classifier",
- "target_column": "class",
- "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v', 'pred_pka_reg'],
+ "model_type": "regressor",
+ "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
+ "features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/sol-with-pka-class-100-test/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/temp-hlm-phase1-reg-0-80/training",
+ "train_all_data": False
  }

  # Function to check if dataframe is empty
@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      """
      Matches and renames DataFrame columns to match model feature names (case-insensitive).
      Prioritizes exact matches, then case-insensitive matches.
-
+
      Raises ValueError if any model features cannot be matched.
      """
      df_columns_lower = {col.lower(): col for col in df.columns}
      rename_dict = {}
      missing = []
-
      for feature in model_features:
          if feature in df.columns:
              continue  # Exact match
@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
              rename_dict[df_columns_lower[feature.lower()]] = feature
          else:
              missing.append(feature)
-
+
      if missing:
          raise ValueError(f"Features not found: {missing}")
-
+
+     # Rename the DataFrame columns to match the model features
      return df.rename(columns=rename_dict)


@@ -197,7 +197,7 @@ if __name__ == "__main__":
      """The main function is for training the XGBoost model"""

      # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
+     target = TEMPLATE_PARAMS["target"]
      features = TEMPLATE_PARAMS["features"]
      orig_features = features.copy()
      compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
      """Parse input data and return a DataFrame."""
      if not input_data:
          raise ValueError("Empty input data is not supported!")
-
+
      # Decode bytes to string if necessary
      if isinstance(input_data, bytes):
          input_data = input_data.decode("utf-8")
@@ -29,7 +29,7 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
      "model_type": "{{model_type}}",
-     "target_column": "{{target_column}}",
+     "target": "{{target_column}}",
      "features": "{{feature_list}}",
      "compressed_features": "{{compressed_features}}",
      "model_metrics_s3_path": "{{model_metrics_s3_path}}",
@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      """
      Matches and renames DataFrame columns to match model feature names (case-insensitive).
      Prioritizes exact matches, then case-insensitive matches.
-
+
      Raises ValueError if any model features cannot be matched.
      """
      df_columns_lower = {col.lower(): col for col in df.columns}
      rename_dict = {}
      missing = []
-
      for feature in model_features:
          if feature in df.columns:
              continue  # Exact match
@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
              rename_dict[df_columns_lower[feature.lower()]] = feature
          else:
              missing.append(feature)
-
+
      if missing:
          raise ValueError(f"Features not found: {missing}")
-
+
+     # Rename the DataFrame columns to match the model features
      return df.rename(columns=rename_dict)


@@ -197,7 +197,7 @@ if __name__ == "__main__":
      """The main function is for training the XGBoost model"""

      # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
+     target = TEMPLATE_PARAMS["target"]
      features = TEMPLATE_PARAMS["features"]
      orig_features = features.copy()
      compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
      """Parse input data and return a DataFrame."""
      if not input_data:
          raise ValueError("Empty input data is not supported!")
-
+
      # Decode bytes to string if necessary
      if isinstance(input_data, bytes):
          input_data = input_data.decode("utf-8")
@@ -76,7 +76,7 @@ def run_batch_job(script_path: str, size: str = "small") -> int:
      response = batch.submit_job(
          jobName=job_name,
          jobQueue="workbench-job-queue",
-         jobDefinition=f"workbench-ml-pipeline-{size}",
+         jobDefinition=f"workbench-batch-{size}",
          containerOverrides={
              "environment": [
                  {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
@@ -0,0 +1,139 @@
+ import argparse
+ import logging
+ import json
+ from pathlib import Path
+
+ # Workbench Imports
+ from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+ from workbench.utils.config_manager import ConfigManager
+ from workbench.utils.s3_utils import upload_content_to_s3
+
+ log = logging.getLogger("workbench")
+ cm = ConfigManager()
+ workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+ def submit_to_sqs(script_path: str, size: str = "small") -> None:
+     """
+     Upload script to S3 and submit message to SQS queue for processing.
+     Args:
+         script_path: Local path to the ML pipeline script
+         size: Job size tier - "small" (default), "medium", or "large"
+     """
+     print(f"\n{'=' * 60}")
+     print("🚀 SUBMITTING ML PIPELINE JOB")
+     print(f"{'=' * 60}")
+
+     if size not in ["small", "medium", "large"]:
+         raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+     # Validate script exists
+     script_file = Path(script_path)
+     if not script_file.exists():
+         raise FileNotFoundError(f"Script not found: {script_path}")
+
+     print(f"📄 Script: {script_file.name}")
+     print(f"📏 Size tier: {size}")
+     print(f"🪣 Bucket: {workbench_bucket}")
+     sqs = AWSAccountClamp().boto3_session.client("sqs")
+     script_name = script_file.name
+
+     # List Workbench queues
+     print("\n📋 Listing Workbench SQS queues...")
+     try:
+         queues = sqs.list_queues(QueueNamePrefix="workbench-")
+         queue_urls = queues.get("QueueUrls", [])
+         if queue_urls:
+             print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+             for url in queue_urls:
+                 queue_name = url.split("/")[-1]
+                 print(f"   • {queue_name}")
+         else:
+             print("⚠️ No workbench queues found")
+     except Exception as e:
+         print(f"❌ Error listing queues: {e}")
+
+     # Upload script to S3
+     s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+     print("\n📤 Uploading script to S3...")
+     print(f"   Source: {script_path}")
+     print(f"   Destination: {s3_path}")
+
+     try:
+         upload_content_to_s3(script_file.read_text(), s3_path)
+         print("✅ Script uploaded successfully")
+     except Exception as e:
+         print(f"❌ Upload failed: {e}")
+         raise
+     # Get queue URL and info
+     queue_name = "workbench-ml-pipeline-queue.fifo"
+     print("\n🎯 Getting queue information...")
+     print(f"   Queue name: {queue_name}")
+
+     try:
+         queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+         print(f"   Queue URL: {queue_url}")
+
+         # Get queue attributes for additional info
+         attrs = sqs.get_queue_attributes(
+             QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+         )
+         messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+         messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+         print(f"   Messages in queue: {messages_available}")
+         print(f"   Messages in flight: {messages_in_flight}")
+
+     except Exception as e:
+         print(f"❌ Error accessing queue: {e}")
+         raise
+
+     # Prepare message
+     message = {"script_path": s3_path, "size": size}
+     print("\n📨 Sending message to SQS...")
+
+     # Send the message to SQS
+     try:
+         response = sqs.send_message(
+             QueueUrl=queue_url,
+             MessageBody=json.dumps(message, indent=2),
+             MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+         )
+         message_id = response["MessageId"]
+         print("✅ Message sent successfully!")
+         print(f"   Message ID: {message_id}")
+     except Exception as e:
+         print(f"❌ Failed to send message: {e}")
+         raise
+
+     # Success summary
+     print(f"\n{'=' * 60}")
+     print("✅ JOB SUBMISSION COMPLETE")
+     print(f"{'=' * 60}")
+     print(f"📄 Script: {script_name}")
+     print(f"📏 Size: {size}")
+     print(f"🆔 Message ID: {message_id}")
+     print("\n🔍 MONITORING LOCATIONS:")
+     print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+     print("   • Lambda Logs: AWS Console → Lambda → Functions")
+     print("   • Batch Jobs: AWS Console → Batch → Jobs")
+     print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+     print("\n⏳ Your job should start processing soon...")
+
+
+ def main():
+     """CLI entry point for submitting ML pipelines via SQS."""
+     parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+     parser.add_argument("script_file", help="Local path to ML pipeline script")
+     parser.add_argument(
+         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+     )
+     args = parser.parse_args()
+     try:
+         submit_to_sqs(args.script_file, args.size)
+     except Exception as e:
+         print(f"\n❌ ERROR: {e}")
+         log.error(f"Error: {e}")
+         exit(1)
+
+
+ if __name__ == "__main__":
+     main()
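
Editor's note: the new workbench/scripts/ml_pipeline_sqs.py above exposes submit_to_sqs(). A hedged usage sketch (assumes workbench 0.8.172 is installed, AWS credentials are configured, and the workbench-ml-pipeline-queue.fifo queue exists in the account; the pipeline filename is hypothetical):

    from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

    # Uploads the script to s3://<WORKBENCH_BUCKET>/batch-jobs/ and queues a FIFO message
    submit_to_sqs("my_pipeline.py", size="medium")
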
@@ -140,7 +140,7 @@ def uq_model(model: "Model", uq_model_name: str, train_all_data: bool = False) -
      from workbench.api import Model, ModelType, FeatureSet  # noqa: F401 (avoid circular import)

      # Get the custom script path for the UQ model
-     script_path = get_custom_script_path("uq_models", "meta_uq.template")
+     script_path = get_custom_script_path("uq_models", "mapie.template")

      # Get Feature and Target Columns from the existing given Model
      features = model.features()
@@ -220,6 +220,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
      # --- Coverage and Interval Width ---
      if "q_025" in df.columns and "q_975" in df.columns:
          lower_95, upper_95 = df["q_025"], df["q_975"]
+         lower_90, upper_90 = df["q_05"], df["q_95"]
+         lower_80, upper_80 = df["q_10"], df["q_90"]
          lower_50, upper_50 = df["q_25"], df["q_75"]
      elif "prediction_std" in df.columns:
          lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -231,8 +233,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
          "Either quantile columns (q_025, q_975, q_25, q_75) or 'prediction_std' column must be present."
      )
      coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
+     coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
+     coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
      coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
      avg_width_95 = np.mean(upper_95 - lower_95)
+     avg_width_90 = np.mean(upper_90 - lower_90)
+     avg_width_80 = np.mean(upper_80 - lower_80)
      avg_width_50 = np.mean(upper_50 - lower_50)

      # --- CRPS (measures calibration + sharpness) ---
@@ -260,6 +266,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
      # Collect results
      results = {
          "coverage_95": coverage_95,
+         "coverage_90": coverage_90,
+         "coverage_80": coverage_80,
          "coverage_50": coverage_50,
          "avg_width_95": avg_width_95,
          "avg_width_50": avg_width_50,
@@ -271,8 +279,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

      print("\n=== UQ Metrics ===")
      print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
+     print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
+     print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
      print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
      print(f"Average 95% Width: {avg_width_95:.3f}")
+     print(f"Average 90% Width: {avg_width_90:.3f}")
+     print(f"Average 80% Width: {avg_width_80:.3f}")
      print(f"Average 50% Width: {avg_width_50:.3f}")
      print(f"CRPS: {mean_crps:.3f} (lower is better)")
      print(f"Interval Score 95%: {mean_is_95:.3f} (lower is better)")