PyPI - workbench - Versions diffs - 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl - Mend

workbench: workbench-0.8.202-py3-none-any.whl → workbench-0.8.220-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of workbench might be problematic. Click here for more details.

Files changed (84)

workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
workbench/algorithms/dataframe/projection_2d.py +44 -21
workbench/algorithms/dataframe/proximity.py +78 -150
workbench/algorithms/graph/light/proximity_graph.py +5 -5
workbench/algorithms/models/cleanlab_model.py +382 -0
workbench/algorithms/models/noise_model.py +388 -0
workbench/algorithms/sql/outliers.py +3 -3
workbench/api/__init__.py +3 -0
workbench/api/df_store.py +17 -108
workbench/api/endpoint.py +13 -11
workbench/api/feature_set.py +111 -8
workbench/api/meta_model.py +289 -0
workbench/api/model.py +45 -12
workbench/api/parameter_store.py +3 -52
workbench/cached/cached_model.py +4 -4
workbench/core/artifacts/artifact.py +5 -5
workbench/core/artifacts/df_store_core.py +114 -0
workbench/core/artifacts/endpoint_core.py +228 -237
workbench/core/artifacts/feature_set_core.py +185 -230
workbench/core/artifacts/model_core.py +34 -26
workbench/core/artifacts/parameter_store_core.py +98 -0
workbench/core/pipelines/pipeline_executor.py +1 -1
workbench/core/transforms/features_to_model/features_to_model.py +22 -10
workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
workbench/model_script_utils/model_script_utils.py +339 -0
workbench/model_script_utils/pytorch_utils.py +405 -0
workbench/model_script_utils/uq_harness.py +278 -0
workbench/model_scripts/chemprop/chemprop.template +428 -631
workbench/model_scripts/chemprop/generated_model_script.py +432 -635
workbench/model_scripts/chemprop/model_script_utils.py +339 -0
workbench/model_scripts/chemprop/requirements.txt +2 -10
workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
workbench/model_scripts/meta_model/generated_model_script.py +209 -0
workbench/model_scripts/meta_model/meta_model.template +209 -0
workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
workbench/model_scripts/pytorch_model/pytorch.template +370 -609
workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
workbench/model_scripts/pytorch_model/requirements.txt +1 -1
workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
workbench/model_scripts/script_generation.py +6 -5
workbench/model_scripts/uq_models/generated_model_script.py +65 -422
workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
workbench/model_scripts/xgb_model/uq_harness.py +278 -0
workbench/model_scripts/xgb_model/xgb_model.template +366 -396
workbench/repl/workbench_shell.py +0 -5
workbench/resources/open_source_api.key +1 -1
workbench/scripts/endpoint_test.py +2 -2
workbench/scripts/meta_model_sim.py +35 -0
workbench/scripts/training_test.py +85 -0
workbench/utils/chem_utils/fingerprints.py +87 -46
workbench/utils/chem_utils/projections.py +16 -6
workbench/utils/chemprop_utils.py +36 -655
workbench/utils/meta_model_simulator.py +499 -0
workbench/utils/metrics_utils.py +256 -0
workbench/utils/model_utils.py +192 -54
workbench/utils/pytorch_utils.py +33 -472
workbench/utils/shap_utils.py +1 -55
workbench/utils/xgboost_local_crossfold.py +267 -0
workbench/utils/xgboost_model_utils.py +49 -356
workbench/web_interface/components/model_plot.py +7 -1
workbench/web_interface/components/plugins/model_details.py +30 -68
workbench/web_interface/components/plugins/scatter_plot.py +4 -8
{workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
{workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
{workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
workbench/model_scripts/uq_models/mapie.template +0 -605
workbench/model_scripts/uq_models/requirements.txt +0 -1
{workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
{workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
{workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0

workbench/model_scripts/xgb_model/generated_model_script.py CHANGED Viewed

@@ -1,468 +1,445 @@
-# Imports for XGB Model
-import xgboost as xgb
-import awswrangler as wr
-import numpy as np
-# Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error,
-    precision_recall_fscore_support,
-    confusion_matrix,
-)
+# XGBoost Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Uncertainty quantification for regression models
+# - Sample weights support
+# - Categorical feature handling
+# - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.
-# Classification Encoder
-from sklearn.preprocessing import LabelEncoder
-# Scikit Learn Imports
-from sklearn.model_selection import train_test_split
-from io import StringIO
 import json
-import argparse
-import joblib
 import os
-import pandas as pd
-from typing import List, Tuple
-# Template Parameters
-TEMPLATE_PARAMS = {
-    "model_type": "regressor",
-    "target": "class_number_of_rings",
-    "features": ['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'sex'],
-    "compressed_features": [],
-    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/abalone-regression/training",
-    "train_all_data": False,
-    "hyperparameters": {},
-}
-# Function to check if dataframe is empty
-def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-    """
-    Check if the provided dataframe is empty and raise an exception if it is.
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
+import joblib
+import numpy as np
+import pandas as pd
+import xgboost as xgb
-def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
-    """
-    Expands a column in a DataFrame containing a list of probabilities into separate columns.
+from model_script_utils import (
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
+)
-    Args:
-        df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (List[str]): List of class labels
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,  # Number of CV folds (1 = single train/val split)
+    # Core tree parameters
+    "n_estimators": 300,
+    "max_depth": 7,
+    "learning_rate": 0.05,
+    # Sampling parameters (less aggressive - ensemble provides regularization)
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    # Regularization (lighter - ensemble averaging reduces overfitting)
+    "min_child_weight": 3,
+    "gamma": 0.1,
+    "reg_alpha": 0.1,
+    "reg_lambda": 1.0,
+    # Random seed
+    "seed": 42,
+}
-    Returns:
-        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-    """
+# Workbench-specific parameters (not passed to XGBoost)
+WORKBENCH_PARAMS = {"n_folds"}
-    # Sanity check
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
+# Regression-only parameters (filtered out for classifiers)
+REGRESSION_ONLY_PARAMS = {"objective"}
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
+# Template parameters (filled in by Workbench)
+TEMPLATE_PARAMS = {
+    "model_type": "uq_regressor",
+    "target": "udm_asy_res_efflux_ratio",
+    "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 
'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 
'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+    "id_column": "udm_mol_bat_id",
+    "compressed_features": ['fingerprint'],
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-temporal/training",
+    "hyperparameters": {'n_folds': 1},
+}
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
+    else:
+        n_ensemble = 1  # Legacy single model
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    print(df)
-    return df
+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))
+    print(f"Loaded {len(ensemble_models)} model(s)")
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble."""
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    print(f"Model Features: {features}")
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
+    # Extract model components
+    ensemble_models = model_dict["ensemble_models"]
+    label_encoder = model_dict.get("label_encoder")
+    category_mappings = model_dict.get("category_mappings", {})
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
-    # Rename the DataFrame columns to match the model features
-    return df.rename(columns=rename_dict)
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
-def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
+    X = matched_df[features]
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-                                            training mode. If populated, we're in inference mode.
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    # Training mode
-    if category_mappings == {}:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)
-    # Inference mode
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
+        df = expand_proba_column(df, label_encoder.classes_)
     else:
-        for col, categories in category_mappings.items():
-            if col in df.columns:
-                print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-    return df, category_mappings
-def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
-) -> Tuple[pd.DataFrame, List[str]]:
-    """Prepare features for the model by decompressing bitstring features
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        List[str]: Updated list of feature names after decompression
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-    # Decompress the specified compressed features
-    decompressed_features = features.copy()
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-        # Add to features list
-        decompressed_features.extend(new_col_names)
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)
-        # Drop original column and concatenate new ones
-        df = df.drop(columns=[feature])
-        df = pd.concat([df, new_df], axis=1)
+        # Add UQ intervals if available
+        if uq_models and uq_metadata:
+            df = predict_intervals(df, X, uq_models, uq_metadata)
+            df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
-    return df, decompressed_features
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+    return df
+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-    """The main function is for training the XGBoost model"""
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()
-    # Harness Template Parameters
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
-    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
-    # Read the training data into DataFrames
-    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")
-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {str(features)}")
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")
-    # Convert any features that might be categorical to 'category' type
+    # -------------------------------------------------------------------------
+    # Preprocessing: Categorical features and decompression
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)
-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}...")
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)
-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-    # Use any hyperparameters to set up both the trainer and model configurations
-    print(f"Hyperparameters: {hyperparameters}")
-    # Now spin up our XGB Model
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
-        # Encode the target column
         label_encoder = LabelEncoder()
-        df_train[target] = label_encoder.fit_transform(df_train[target])
-        df_val[target] = label_encoder.transform(df_val[target])
-    else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
-        label_encoder = None  # We don't need this for regression
-    # Grab our Features, Target and Train the Model
-    y_train = df_train[target]
-    X_train = df_train[features]
-    xgb_model.fit(X_train, y_train)
-    # Make Predictions on the Validation Set
-    print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
-    preds = xgb_model.predict(X_validate)
-    if model_type == "classifier":
-        # Also get the probabilities for each class
-        print("Processing Probabilities...")
-        probs = xgb_model.predict_proba(X_validate)
-        df_val["pred_proba"] = [p.tolist() for p in probs]
-        # Expand the pred_proba column into separate columns for each class
-        print(df_val.columns)
-        df_val = expand_proba_column(df_val, label_encoder.classes_)
-        print(df_val.columns)
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(y_validate)
-        preds = label_encoder.inverse_transform(preds)
-    # Save predictions to S3 (just the target, prediction, and '_proba' columns)
-    df_val["prediction"] = preds
-    output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-    wr.s3.to_csv(
-        df_val[output_columns],
-        path=f"{model_metrics_s3_path}/validation_predictions.csv",
-        index=False,
-    )
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        print(f"Class labels: {label_encoder.classes_.tolist()}")
-    # Report Performance Metrics
-    if model_type == "classifier":
-        # Get the label names and their integer mapping
-        label_names = label_encoder.classes_
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
-        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
-        # Put the scores into a dataframe
-        score_df = pd.DataFrame(
-            {
-                target: label_names,
-                "precision": scores[0],
-                "recall": scores[1],
-                "f1": scores[2],
-                "support": scores[3],
-            }
-        )
-        # We need to get creative with the Classification Metrics
-        metrics = ["precision", "recall", "f1", "support"]
-        for t in label_names:
-            for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
-                print(f"Metrics:{t}:{m} {value}")
-        # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
-        for i, row_name in enumerate(label_names):
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    # Map 'seed' to 'random_state' for XGBoost
+    if "seed" in xgb_params:
+        xgb_params["random_state"] = xgb_params.pop("seed")
+    # Handle objective: filter regression-only params for classifiers, set default for regressors
+    if model_type == "classifier":
+        xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
     else:
-        # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds)
-        mae = mean_absolute_error(y_validate, preds)
-        r2 = r2_score(y_validate, preds)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-    # Now save the model to the standard place/name
-    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
-    # Save the label encoder if we have one
-    if label_encoder:
-        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-    # Save the features (this will validate input during predictions)
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-    # Save the category mappings
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+        # Default to MAE (reg:absoluteerror) for regression if not specified
+        xgb_params.setdefault("objective", "reg:absoluteerror")
+    print(f"XGBoost params: {xgb_params}")
-def model_fn(model_dir):
-    """Deserialize and return fitted XGBoost model"""
-    model_path = os.path.join(model_dir, "xgb_model.joblib")
-    model = joblib.load(model_path)
-    return model
-def input_fn(input_data, content_type):
-    """Parse input data and return a DataFrame."""
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
+    if n_folds == 1:
+        # Single train/val split
+        if "training" in all_df.columns:
+            print("Using 'training' column for train/val split")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-fold cross-validation
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df, all_df[target]))
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df))
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
-    if "text/csv" in content_type:
-        return pd.read_csv(StringIO(input_data))
-    elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    # Initialize out-of-fold storage
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier":
+        num_classes = len(label_encoder.classes_)
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
     else:
-        raise ValueError(f"{content_type} not supported!")
+        oof_proba = None
+    # Check for sample weights
+    has_sample_weights = "sample_weight" in all_df.columns
+    if has_sample_weights:
+        sw = all_df["sample_weight"]
+        print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
+    # Train ensemble
+    ensemble_models = []
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
+        print(f"{'='*50}")
+        # Prepare fold data
+        X_train = all_df.iloc[train_idx][features]
+        y_train = all_df.iloc[train_idx][target]
+        X_val = all_df.iloc[val_idx][features]
+        sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
+        # Create model with fold-specific random state for diversity
+        fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
+        if model_type == "classifier":
+            model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
+        else:
+            model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
+        # Train
+        model.fit(X_train, y_train, sample_weight=sample_weights)
+        ensemble_models.append(model)
+        # Out-of-fold predictions
+        oof_predictions[val_idx] = model.predict(X_val)
+        if model_type == "classifier":
+            oof_proba[val_idx] = model.predict_proba(X_val)
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
+    if n_folds == 1:
+        # Single fold: only validation rows
+        val_mask = ~np.isnan(oof_predictions)
+        df_val = all_df[val_mask].copy()
+        predictions = oof_predictions[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        # K-fold: all rows have out-of-fold predictions
+        df_val = all_df.copy()
+        predictions = oof_predictions
-def output_fn(output_df, accept_type):
-    """Supports both CSV and JSON output formats."""
-    if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-        return csv_output, "text/csv"
-    elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+    # Decode labels for classification
+    if model_type == "classifier":
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
+        if oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
+            df_val = expand_proba_column(df_val, label_encoder.classes_)
     else:
-        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+        df_val["prediction"] = predictions
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values
-def predict_fn(df, model) -> pd.DataFrame:
-    """Make Predictions with our XGB Model
+    if model_type == "classifier":
+        label_names = label_encoder.classes_
+        score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
+        print_classification_metrics(score_df, target, label_names)
+        print_confusion_matrix(y_true, y_pred, label_names)
+    else:
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
+            df_val["prediction_std"] = np.std(all_preds, axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
-    Args:
-        df (pd.DataFrame): The input DataFrame
-        model: The model use for predictions
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
-    Returns:
-        pd.DataFrame: The DataFrame with the predictions added
-    """
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    if model_type != "classifier":
+        output_columns.append("prediction_std")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
-    # Grab our feature columns (from training)
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]
-    # Load the category mappings (from training)
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
-    # Load our Label Encoder if we have one
-    label_encoder = None
-    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    for idx, m in enumerate(ensemble_models):
+        joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} model(s)")
-    # We're going match features in a case-insensitive manner, accounting for all the permutations
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-    matched_df = match_features_case_insensitive(df, features)
+    with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
+        json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)
-    # Detect categorical types in the incoming DataFrame
-    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)
-    # If we have compressed features, decompress them
-    if compressed_features:
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)
-    # Predict the features against our XGB Model
-    X = matched_df[features]
-    predictions = model.predict(X)
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)
-    # If we have a label encoder, decode the predictions
     if label_encoder:
-        predictions = label_encoder.inverse_transform(predictions)
-    # Set the predictions on the DataFrame
-    df["prediction"] = predictions
-    # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
-    if getattr(model, "predict_proba", None):
-        probs = model.predict_proba(matched_df[features])
-        df["pred_proba"] = [p.tolist() for p in probs]
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-        # Expand the pred_proba column into separate columns for each class
-        df = expand_proba_column(df, label_encoder.classes_)
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)
-    # All done, return the DataFrame with new columns for the predictions
-    return df
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")

workbench 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl

Potentially problematic release.

workbench 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl