workbench-0.8.213-py3-none-any.whl → workbench-0.8.219-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/algorithms/sql/outliers.py +3 -3
  9. workbench/api/__init__.py +3 -0
  10. workbench/api/endpoint.py +10 -5
  11. workbench/api/feature_set.py +76 -6
  12. workbench/api/meta_model.py +289 -0
  13. workbench/api/model.py +43 -4
  14. workbench/core/artifacts/endpoint_core.py +65 -117
  15. workbench/core/artifacts/feature_set_core.py +3 -3
  16. workbench/core/artifacts/model_core.py +6 -4
  17. workbench/core/pipelines/pipeline_executor.py +1 -1
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  19. workbench/model_script_utils/model_script_utils.py +15 -11
  20. workbench/model_script_utils/pytorch_utils.py +11 -1
  21. workbench/model_scripts/chemprop/chemprop.template +147 -71
  22. workbench/model_scripts/chemprop/generated_model_script.py +151 -75
  23. workbench/model_scripts/chemprop/model_script_utils.py +15 -11
  24. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  25. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  27. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  28. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  29. workbench/model_scripts/meta_model/meta_model.template +209 -0
  30. workbench/model_scripts/pytorch_model/generated_model_script.py +45 -27
  31. workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
  32. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  33. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  34. workbench/model_scripts/script_generation.py +4 -0
  35. workbench/model_scripts/xgb_model/generated_model_script.py +167 -156
  36. workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
  37. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  38. workbench/repl/workbench_shell.py +0 -5
  39. workbench/scripts/endpoint_test.py +2 -2
  40. workbench/scripts/meta_model_sim.py +35 -0
  41. workbench/utils/chem_utils/fingerprints.py +87 -46
  42. workbench/utils/chemprop_utils.py +23 -5
  43. workbench/utils/meta_model_simulator.py +499 -0
  44. workbench/utils/metrics_utils.py +94 -10
  45. workbench/utils/model_utils.py +91 -9
  46. workbench/utils/pytorch_utils.py +1 -1
  47. workbench/utils/shap_utils.py +1 -55
  48. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  49. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/METADATA +2 -1
  50. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/RECORD +54 -50
  51. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
  52. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  53. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  54. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  55. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  56. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
  57. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
  58. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
@@ -5,51 +5,36 @@
 # - Out-of-fold predictions for validation metrics
 # - Categorical feature embedding via TabularMLP
 # - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.

-import argparse
 import json
 import os

-import awswrangler as wr
 import joblib
 import numpy as np
 import pandas as pd
 import torch
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-from sklearn.preprocessing import LabelEncoder
-
-# Enable Tensor Core optimization for GPUs that support it
-torch.set_float32_matmul_precision("medium")

 from model_script_utils import (
-    check_dataframe,
-    compute_classification_metrics,
-    compute_regression_metrics,
     convert_categorical_types,
     decompress_features,
     expand_proba_column,
     input_fn,
     match_features_case_insensitive,
     output_fn,
-    print_classification_metrics,
-    print_confusion_matrix,
-    print_regression_metrics,
 )
 from pytorch_utils import (
     FeatureScaler,
-    create_model,
     load_model,
     predict,
     prepare_data,
-    save_model,
-    train_model,
 )
 from uq_harness import (
     compute_confidence,
     load_uq_models,
     predict_intervals,
-    save_uq_models,
-    train_uq_models,
 )

 # =============================================================================
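Note: this hunk (and its XGBoost twin further down) restructures the script so that module scope only pays for inference-time imports, with training-only dependencies loaded under the __main__ guard. A minimal sketch of the pattern, with illustrative names that are not from this package:

import json  # lightweight: needed on the inference path

def handler(payload: str) -> dict:
    # Serverless inference imports nothing heavy, so cold starts stay fast
    return {"echo": json.loads(payload)}

if __name__ == "__main__":
    # Heavy, training-only imports run here and never on endpoint startup
    import awswrangler as wr  # noqa: F401
    from sklearn.model_selection import train_test_split  # noqa: F401
    print("training path: heavy dependencies loaded")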
@@ -59,13 +44,15 @@ DEFAULT_HYPERPARAMETERS = {
     # Training parameters
     "n_folds": 5,
     "max_epochs": 200,
-    "early_stopping_patience": 20,
+    "early_stopping_patience": 30,
     "batch_size": 128,
-    # Model architecture
-    "layers": "256-128-64",
+    # Model architecture (larger capacity - ensemble provides regularization)
+    "layers": "512-256-128",
     "learning_rate": 1e-3,
-    "dropout": 0.1,
+    "dropout": 0.05,
     "use_batch_norm": True,
+    # Loss function for regression (L1Loss=MAE, MSELoss=MSE, HuberLoss, SmoothL1Loss)
+    "loss": "L1Loss",
     # Random seed
     "seed": 42,
 }
@@ -86,7 +73,7 @@ TEMPLATE_PARAMS = {
 # Model Loading (for SageMaker inference)
 # =============================================================================
 def model_fn(model_dir: str) -> dict:
-    """Load TabularMLP ensemble from the specified directory."""
+    """Load PyTorch TabularMLP ensemble from the specified directory."""
     # Load ensemble metadata
     metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
     if os.path.exists(metadata_path):
@@ -129,7 +116,7 @@ def model_fn(model_dir: str) -> dict:
 # Inference (for SageMaker inference)
 # =============================================================================
 def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
-    """Make predictions with TabularMLP ensemble."""
+    """Make predictions with PyTorch TabularMLP ensemble."""
     model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
@@ -233,6 +220,36 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
 # Training
 # =============================================================================
 if __name__ == "__main__":
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    # Enable Tensor Core optimization for GPUs that support it
+    torch.set_float32_matmul_precision("medium")
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from pytorch_utils import (
+        create_model,
+        save_model,
+        train_model,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+
     # -------------------------------------------------------------------------
     # Setup: Parse arguments and load data
     # -------------------------------------------------------------------------
@@ -377,6 +394,7 @@ if __name__ == "__main__":
377
394
  patience=hyperparameters["early_stopping_patience"],
378
395
  batch_size=hyperparameters["batch_size"],
379
396
  learning_rate=hyperparameters["learning_rate"],
397
+ loss=hyperparameters.get("loss", "L1Loss"),
380
398
  device=device,
381
399
  )
382
400
  ensemble_models.append(model)
@@ -245,6 +245,7 @@ def train_model(
     patience: int = 20,
     batch_size: int = 128,
     learning_rate: float = 1e-3,
+    loss: str = "L1Loss",
     device: str = "cpu",
 ) -> tuple[TabularMLP, dict]:
     """Train the model with early stopping.
@@ -272,7 +273,16 @@
     if task == "classification":
         criterion = nn.CrossEntropyLoss()
     else:
-        criterion = nn.MSELoss()
+        # Map loss name to PyTorch loss class
+        loss_map = {
+            "L1Loss": nn.L1Loss,
+            "MSELoss": nn.MSELoss,
+            "HuberLoss": nn.HuberLoss,
+            "SmoothL1Loss": nn.SmoothL1Loss,
+        }
+        if loss not in loss_map:
+            raise ValueError(f"Unknown loss '{loss}'. Supported: {list(loss_map.keys())}")
+        criterion = loss_map[loss]()

     optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

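Note: with this mapping the regression criterion becomes selectable by name via the "loss" hyperparameter. A quick check of the dispatch (toy tensors; nn.HuberLoss keeps its default delta=1.0):

import torch
import torch.nn as nn

loss_map = {"L1Loss": nn.L1Loss, "MSELoss": nn.MSELoss,
            "HuberLoss": nn.HuberLoss, "SmoothL1Loss": nn.SmoothL1Loss}
criterion = loss_map["HuberLoss"]()
print(criterion(torch.tensor([2.5]), torch.tensor([2.0])))  # tensor(0.1250) = 0.5 * 0.5**2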
@@ -100,6 +100,7 @@ def generate_model_script(template_params: dict) -> str:
             - model_metrics_s3_path (str): The S3 path to store the model metrics
             - train_all_data (bool): Whether to train on all (100%) of the data
             - hyperparameters (dict, optional): Hyperparameters for the model (default: None)
+            - child_endpoints (list[str], optional): For META models, list of child endpoint names

     Returns:
         str: The name of the generated model script
@@ -116,6 +117,9 @@ def generate_model_script(template_params: dict) -> str:
     elif template_params["model_framework"] == ModelFramework.CHEMPROP:
         template_name = "chemprop.template"
         model_script_dir = "chemprop"
+    elif template_params["model_framework"] == ModelFramework.META:
+        template_name = "meta_model.template"
+        model_script_dir = "meta_model"
     elif template_params["model_type"] in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.CLASSIFIER]:
         template_name = "xgb_model.template"
         model_script_dir = "xgb_model"
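Note: with this branch, generating a META model script would look roughly like the sketch below. Only ModelFramework.META, child_endpoints, and the template/directory names are confirmed by this diff; the endpoint names are hypothetical and the remaining required keys are elided.

params = {
    "model_framework": ModelFramework.META,  # routes to meta_model.template
    "child_endpoints": ["child-endpoint-1", "child-endpoint-2"],  # hypothetical names
    "model_metrics_s3_path": "s3://my-bucket/models/my-meta-model/training",
    "train_all_data": True,
    "hyperparameters": None,
    # ...plus the other documented keys (target, features, etc.)
}
script_path = generate_model_script(params)  # fills in meta_model.template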
@@ -7,39 +7,30 @@
 # - Sample weights support
 # - Categorical feature handling
 # - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.

-import argparse
 import json
 import os

-import awswrangler as wr
 import joblib
 import numpy as np
 import pandas as pd
 import xgboost as xgb
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-from sklearn.preprocessing import LabelEncoder

 from model_script_utils import (
-    check_dataframe,
-    compute_classification_metrics,
-    compute_regression_metrics,
     convert_categorical_types,
     decompress_features,
     expand_proba_column,
     input_fn,
     match_features_case_insensitive,
     output_fn,
-    print_classification_metrics,
-    print_confusion_matrix,
-    print_regression_metrics,
 )
 from uq_harness import (
     compute_confidence,
     load_uq_models,
     predict_intervals,
-    save_uq_models,
-    train_uq_models,
 )

 # =============================================================================
@@ -49,41 +40,173 @@ DEFAULT_HYPERPARAMETERS = {
     # Training parameters
     "n_folds": 5,  # Number of CV folds (1 = single train/val split)
     # Core tree parameters
-    "n_estimators": 200,
-    "max_depth": 6,
+    "n_estimators": 300,
+    "max_depth": 7,
     "learning_rate": 0.05,
-    # Sampling parameters
-    "subsample": 0.7,
-    "colsample_bytree": 0.6,
-    "colsample_bylevel": 0.8,
-    # Regularization
-    "min_child_weight": 5,
-    "gamma": 0.2,
-    "reg_alpha": 0.5,
-    "reg_lambda": 2.0,
+    # Sampling parameters (less aggressive - ensemble provides regularization)
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    # Regularization (lighter - ensemble averaging reduces overfitting)
+    "min_child_weight": 3,
+    "gamma": 0.1,
+    "reg_alpha": 0.1,
+    "reg_lambda": 1.0,
     # Random seed
-    "random_state": 42,
+    "seed": 42,
 }

 # Workbench-specific parameters (not passed to XGBoost)
 WORKBENCH_PARAMS = {"n_folds"}

+# Regression-only parameters (filtered out for classifiers)
+REGRESSION_ONLY_PARAMS = {"objective"}
+
 # Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "uq_regressor",
     "target": "udm_asy_res_efflux_ratio",
-    "features": ['smr_vsa4', 'tpsa', 'numhdonors', 'nhohcount', 'nbase', 'vsa_estate3', 'fr_guanido', 'mollogp', 'peoe_vsa8', 'peoe_vsa1', 'fr_imine', 'vsa_estate2', 'estate_vsa10', 'asphericity', 'xc_3dv', 'smr_vsa3', 'charge_centroid_distance', 'c3sp3', 'nitrogen_span', 'estate_vsa2', 'minpartialcharge', 'hba_hbd_ratio', 'slogp_vsa1', 'axp_7d', 'nocount', 'vsa_estate4', 'vsa_estate6', 'estate_vsa4', 'xc_4dv', 'xc_4d', 'num_s_centers', 'vsa_estate9', 'chi2v', 'axp_5d', 'mi', 'mse', 'bcut2d_mrhi', 'smr_vsa6', 'hallkieralpha', 'balabanj', 'amphiphilic_moment', 'type_ii_pattern_count', 'minabsestateindex', 'bcut2d_mwlow', 'axp_0dv', 'slogp_vsa5', 'axp_2d', 'axp_1dv', 'xch_5d', 'peoe_vsa10', 'molecular_asymmetry', 'kappa3', 'estate_vsa3', 'sse', 'bcut2d_logphi', 'fr_imidazole', 'molecular_volume_3d', 'bertzct', 'maxestateindex', 'aromatic_interaction_score', 'axp_3d', 'radius_of_gyration', 'vsa_estate7', 'si', 'axp_5dv', 'molecular_axis_length', 'estate_vsa6', 'fpdensitymorgan1', 'axp_6d', 'estate_vsa9', 'fpdensitymorgan2', 'xp_0dv', 'xp_6dv', 'molmr', 'qed', 'estate_vsa8', 'peoe_vsa9', 'xch_6dv', 'xp_7d', 'slogp_vsa2', 'xp_5dv', 'bcut2d_chghi', 'xch_6d', 'chi0n', 'slogp_vsa3', 'chi1v', 'chi3v', 'bcut2d_chglo', 'axp_1d', 'mp', 'num_defined_stereocenters', 'xp_3dv', 'bcut2d_mrlow', 'fr_al_oh', 'peoe_vsa7', 'chi2n', 'axp_6dv', 'axp_2dv', 'chi4n', 'xc_3d', 'axp_7dv', 'vsa_estate8', 'xch_7d', 'maxpartialcharge', 'chi1n', 'peoe_vsa2', 'axp_3dv', 'bcut2d_logplow', 'mv', 'xpc_5dv', 'kappa2', 'vsa_estate5', 'xp_5d', 'mm', 'maxabspartialcharge', 'axp_4dv', 'maxabsestateindex', 'axp_4d', 'xch_4dv', 'xp_2dv', 'heavyatommolwt', 'numatomstereocenters', 'xp_7dv', 'numsaturatedheterocycles', 'xp_3d', 'kappa1', 'mz', 'axp_0d', 'chi1', 'xch_4d', 'smr_vsa1', 'xp_2d', 'estate_vsa5', 'phi', 'fr_ether', 'xc_5d', 'c1sp3', 'estate_vsa7', 'estate_vsa1', 'vsa_estate1', 'slogp_vsa4', 'avgipc', 'smr_vsa10', 'numvalenceelectrons', 'xc_5dv', 'peoe_vsa12', 'peoe_vsa6', 'xpc_5d', 'xpc_6d', 'minestateindex', 'chi3n', 'smr_vsa5', 'xp_4d', 'numheteroatoms', 'fpdensitymorgan3', 'xpc_4d', 'sps', 'xp_1d', 'sv', 'fr_ar_n', 'slogp_vsa10', 'c2sp3', 'xpc_4dv', 'chi0v', 'xpc_6dv', 'xp_1dv', 'vsa_estate10', 'sare', 'c2sp2', 'mpe', 'xch_7dv', 'chi4v', 'type_i_pattern_count', 'sp', 'slogp_vsa8', 'amide_count', 'num_stereocenters', 'num_r_centers', 'tertiary_amine_count', 'spe', 'xp_4dv', 'numsaturatedrings', 'mare', 'numhacceptors', 'chi0', 'fractioncsp3', 'fr_nh0', 'xch_5dv', 'fr_aniline', 'smr_vsa7', 'labuteasa', 'c3sp2', 'xp_0d', 'xp_6d', 'peoe_vsa11', 'fr_ar_nh', 'molwt', 'intramolecular_hbond_potential', 'peoe_vsa3', 'fr_nhpyrrole', 'numaliphaticrings', 'hybratio', 'smr_vsa9', 'peoe_vsa13', 'bcut2d_mwhi', 'c1sp2', 'slogp_vsa11', 'numrotatablebonds', 'numaliphaticcarbocycles', 'slogp_vsa6', 'peoe_vsa4', 'numunspecifiedatomstereocenters', 'xc_6d', 'xc_6dv', 'num_unspecified_stereocenters', 'sz', 'minabspartialcharge', 'fcsp3', 'c1sp1', 'fr_piperzine', 'numaliphaticheterocycles', 'numamidebonds', 'fr_benzene', 'numaromaticheterocycles', 'sm', 'fr_priamide', 'fr_piperdine', 'fr_methoxy', 'c4sp3', 'fr_c_o_nocoo', 'exactmolwt', 'stereo_complexity', 'fr_hoccn', 'numaromaticcarbocycles', 'fr_nh2', 'numheterocycles', 'fr_morpholine', 'fr_ketone', 'fr_nh1', 'frac_defined_stereo', 'fr_aryl_methyl', 'fr_alkyl_halide', 'fr_phenol', 'fr_al_oh_notert', 'fr_ar_oh', 'fr_pyridine', 'fr_amide', 'slogp_vsa7', 'fr_halogen', 'numsaturatedcarbocycles', 'slogp_vsa12', 'fr_ndealkylation1', 'xch_3d', 'fr_bicyclic', 'naromatom', 'narombond'],
+    "features": ['fingerprint'],
     "id_column": "udm_mol_bat_id",
-    "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-test-log/training",
-    "hyperparameters": {'target_transform': 'log'},
+    "compressed_features": ['fingerprint'],
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-fp/training",
+    "hyperparameters": {},
 }


+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
+    else:
+        n_ensemble = 1  # Legacy single model
+
+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))
+
+    print(f"Loaded {len(ensemble_models)} model(s)")
+
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+
+
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble."""
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    print(f"Model Features: {features}")
+
+    # Extract model components
+    ensemble_models = model_dict["ensemble_models"]
+    label_encoder = model_dict.get("label_encoder")
+    category_mappings = model_dict.get("category_mappings", {})
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
+
+    X = matched_df[features]
+
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)
+
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)
+
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
+        df = expand_proba_column(df, label_encoder.classes_)
+    else:
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)
+
+    # Add UQ intervals if available
+    if uq_models and uq_metadata:
+        df = predict_intervals(df, X, uq_models, uq_metadata)
+        df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
+
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+    return df
+
+
 # =============================================================================
 # Training
 # =============================================================================
 if __name__ == "__main__":
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+
     # -------------------------------------------------------------------------
     # Setup: Parse arguments and load data
     # -------------------------------------------------------------------------
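Note: the regression branch of the inlined predict_fn reduces the ensemble members with a plain mean and standard deviation. In toy numbers (illustrative values only):

import numpy as np

# Three hypothetical ensemble members, each predicting two rows
ensemble_preds = np.stack([[1.0, 2.0], [1.2, 1.8], [0.8, 2.2]], axis=0)
print(np.mean(ensemble_preds, axis=0))  # [1. 2.]          -> df["prediction"]
print(np.std(ensemble_preds, axis=0))   # ≈ [0.1633 0.1633] -> df["prediction_std"]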
@@ -123,7 +246,7 @@ if __name__ == "__main__":
123
246
  all_df, features = decompress_features(all_df, features, compressed_features)
124
247
 
125
248
  # -------------------------------------------------------------------------
126
- # Classification setup: Encode target labels
249
+ # Classification setup
127
250
  # -------------------------------------------------------------------------
128
251
  label_encoder = None
129
252
  if model_type == "classifier":
@@ -136,6 +259,18 @@
     # -------------------------------------------------------------------------
     n_folds = hyperparameters["n_folds"]
     xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
+
+    # Map 'seed' to 'random_state' for XGBoost
+    if "seed" in xgb_params:
+        xgb_params["random_state"] = xgb_params.pop("seed")
+
+    # Handle objective: filter regression-only params for classifiers, set default for regressors
+    if model_type == "classifier":
+        xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
+    else:
+        # Default to MAE (reg:absoluteerror) for regression if not specified
+        xgb_params.setdefault("objective", "reg:absoluteerror")
+
     print(f"XGBoost params: {xgb_params}")

     if n_folds == 1:
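Note: tracing the normalization above with the new defaults (a sketch; only the three relevant keys shown):

hyperparameters = {"n_folds": 5, "n_estimators": 300, "seed": 42}
xgb_params = {k: v for k, v in hyperparameters.items() if k not in {"n_folds"}}
xgb_params["random_state"] = xgb_params.pop("seed")      # XGBoost's name for the seed
xgb_params.setdefault("objective", "reg:absoluteerror")  # MAE default, regressors only
print(xgb_params)
# {'n_estimators': 300, 'random_state': 42, 'objective': 'reg:absoluteerror'}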
@@ -285,12 +420,10 @@
     # -------------------------------------------------------------------------
     # Save model artifacts
     # -------------------------------------------------------------------------
-    # Ensemble models
-    for idx, ens_model in enumerate(ensemble_models):
-        joblib.dump(ens_model, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
-    print(f"Saved {len(ensemble_models)} XGBoost model(s)")
+    for idx, m in enumerate(ensemble_models):
+        joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} model(s)")

-    # Metadata files
     with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
         json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)

@@ -310,125 +443,3 @@
         save_uq_models(uq_models, uq_metadata, args.model_dir)

     print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
-
-
-# =============================================================================
-# Model Loading (for SageMaker inference)
-# =============================================================================
-def model_fn(model_dir: str) -> dict:
-    """Load XGBoost ensemble and associated artifacts.
-
-    Args:
-        model_dir: Directory containing model artifacts
-
-    Returns:
-        Dictionary with ensemble_models, label_encoder, category_mappings, uq_models, etc.
-    """
-    # Load ensemble metadata
-    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
-    if os.path.exists(metadata_path):
-        with open(metadata_path) as f:
-            metadata = json.load(f)
-        n_ensemble = metadata["n_ensemble"]
-    else:
-        n_ensemble = 1  # Legacy single model
-
-    # Load ensemble models
-    ensemble_models = []
-    for i in range(n_ensemble):
-        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
-        if not os.path.exists(model_path):
-            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
-        ensemble_models.append(joblib.load(model_path))
-
-    # Load label encoder (classifier only)
-    label_encoder = None
-    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
-    if os.path.exists(encoder_path):
-        label_encoder = joblib.load(encoder_path)
-
-    # Load category mappings
-    category_mappings = {}
-    category_path = os.path.join(model_dir, "category_mappings.json")
-    if os.path.exists(category_path):
-        with open(category_path) as f:
-            category_mappings = json.load(f)
-
-    # Load UQ models (regression only)
-    uq_models, uq_metadata = None, None
-    uq_path = os.path.join(model_dir, "uq_metadata.json")
-    if os.path.exists(uq_path):
-        uq_models, uq_metadata = load_uq_models(model_dir)
-
-    return {
-        "ensemble_models": ensemble_models,
-        "n_ensemble": n_ensemble,
-        "label_encoder": label_encoder,
-        "category_mappings": category_mappings,
-        "uq_models": uq_models,
-        "uq_metadata": uq_metadata,
-    }
-
-
-# =============================================================================
-# Inference (for SageMaker inference)
-# =============================================================================
-def predict_fn(df: pd.DataFrame, models: dict) -> pd.DataFrame:
-    """Make predictions with XGBoost ensemble.
-
-    Args:
-        df: Input DataFrame with features
-        models: Dictionary from model_fn containing ensemble and metadata
-
-    Returns:
-        DataFrame with predictions added
-    """
-    # Load feature columns
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as f:
-        features = json.load(f)
-    print(f"Model Features: {features}")
-
-    # Extract model components
-    ensemble_models = models["ensemble_models"]
-    label_encoder = models.get("label_encoder")
-    category_mappings = models.get("category_mappings", {})
-    uq_models = models.get("uq_models")
-    uq_metadata = models.get("uq_metadata")
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
-
-    # Prepare features
-    matched_df = match_features_case_insensitive(df, features)
-    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
-
-    if compressed_features:
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
-
-    X = matched_df[features]
-
-    # Collect ensemble predictions
-    all_preds = [m.predict(X) for m in ensemble_models]
-    ensemble_preds = np.stack(all_preds, axis=0)
-
-    if label_encoder is not None:
-        # Classification: average probabilities, then argmax
-        all_probs = [m.predict_proba(X) for m in ensemble_models]
-        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
-        class_preds = np.argmax(avg_probs, axis=1)
-
-        df["prediction"] = label_encoder.inverse_transform(class_preds)
-        df["pred_proba"] = [p.tolist() for p in avg_probs]
-        df = expand_proba_column(df, label_encoder.classes_)
-    else:
-        # Regression: average predictions
-        df["prediction"] = np.mean(ensemble_preds, axis=0)
-        df["prediction_std"] = np.std(ensemble_preds, axis=0)
-
-    # Add UQ intervals if available
-    if uq_models and uq_metadata:
-        df = predict_intervals(df, X, uq_models, uq_metadata)
-        df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
-
-    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
-    return df
@@ -148,12 +148,16 @@ def convert_categorical_types(
 def decompress_features(
     df: pd.DataFrame, features: list[str], compressed_features: list[str]
 ) -> tuple[pd.DataFrame, list[str]]:
-    """Decompress bitstring features into individual bit columns.
+    """Decompress compressed features (bitstrings or count vectors) into individual columns.
+
+    Supports two formats (auto-detected):
+    - Bitstrings: "10110010..." → individual uint8 columns (0 or 1)
+    - Count vectors: "0,3,0,1,5,..." → individual uint8 columns (0-255)

     Args:
         df: The features DataFrame
         features: Full list of feature names
-        compressed_features: List of feature names to decompress (bitstrings)
+        compressed_features: List of feature names to decompress

     Returns:
         Tuple of (DataFrame with decompressed features, updated feature list)
@@ -178,18 +182,18 @@
         # Remove the feature from the list to avoid duplication
         decompressed_features.remove(feature)

-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
+        # Auto-detect format and parse: comma-separated counts or bitstring
+        sample = str(df[feature].dropna().iloc[0]) if not df[feature].dropna().empty else ""
+        parse_fn = (lambda s: list(map(int, s.split(",")))) if "," in sample else list
+        feature_matrix = np.array([parse_fn(s) for s in df[feature]], dtype=np.uint8)

-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+        # Create new columns with prefix from feature name
+        prefix = feature[:3]
+        new_col_names = [f"{prefix}_{i}" for i in range(feature_matrix.shape[1])]
+        new_df = pd.DataFrame(feature_matrix, columns=new_col_names, index=df.index)

-        # Add to features list
+        # Update features list and dataframe
         decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
         df = df.drop(columns=[feature])
         df = pd.concat([df, new_df], axis=1)
