workbench 0.8.205__py3-none-any.whl → 0.8.213__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (44)
  1. workbench/algorithms/models/noise_model.py +388 -0
  2. workbench/api/endpoint.py +3 -6
  3. workbench/api/feature_set.py +1 -1
  4. workbench/api/model.py +5 -11
  5. workbench/cached/cached_model.py +4 -4
  6. workbench/core/artifacts/endpoint_core.py +63 -153
  7. workbench/core/artifacts/model_core.py +21 -19
  8. workbench/core/transforms/features_to_model/features_to_model.py +2 -2
  9. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
  10. workbench/model_script_utils/model_script_utils.py +335 -0
  11. workbench/model_script_utils/pytorch_utils.py +395 -0
  12. workbench/model_script_utils/uq_harness.py +278 -0
  13. workbench/model_scripts/chemprop/chemprop.template +289 -666
  14. workbench/model_scripts/chemprop/generated_model_script.py +292 -669
  15. workbench/model_scripts/chemprop/model_script_utils.py +335 -0
  16. workbench/model_scripts/chemprop/requirements.txt +2 -10
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
  18. workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
  19. workbench/model_scripts/pytorch_model/pytorch.template +350 -607
  20. workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
  21. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  22. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  23. workbench/model_scripts/script_generation.py +2 -5
  24. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  25. workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
  26. workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  28. workbench/model_scripts/xgb_model/xgb_model.template +344 -407
  29. workbench/scripts/training_test.py +85 -0
  30. workbench/utils/chemprop_utils.py +18 -656
  31. workbench/utils/metrics_utils.py +172 -0
  32. workbench/utils/model_utils.py +104 -47
  33. workbench/utils/pytorch_utils.py +32 -472
  34. workbench/utils/xgboost_local_crossfold.py +267 -0
  35. workbench/utils/xgboost_model_utils.py +49 -356
  36. workbench/web_interface/components/plugins/model_details.py +30 -68
  37. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/METADATA +5 -5
  38. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/RECORD +42 -31
  39. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/entry_points.txt +1 -0
  40. workbench/model_scripts/uq_models/mapie.template +0 -605
  41. workbench/model_scripts/uq_models/requirements.txt +0 -1
  42. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/WHEEL +0 -0
  43. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/licenses/LICENSE +0 -0
  44. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/top_level.txt +0 -0
workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
@@ -1,497 +1,434 @@
1
- # Imports for XGB Model
2
- import xgboost as xgb
3
- import awswrangler as wr
4
- import numpy as np
5
-
6
- # Model Performance Scores
7
- from sklearn.metrics import (
8
- mean_absolute_error,
9
- median_absolute_error,
10
- r2_score,
11
- root_mean_squared_error,
12
- precision_recall_fscore_support,
13
- confusion_matrix,
14
- )
15
- from scipy.stats import spearmanr
1
+ # XGBoost Model Template for Workbench
2
+ #
3
+ # This template handles both classification and regression models with:
4
+ # - K-fold cross-validation ensemble training (or single train/val split)
5
+ # - Out-of-fold predictions for validation metrics
6
+ # - Uncertainty quantification for regression models
7
+ # - Sample weights support
8
+ # - Categorical feature handling
9
+ # - Compressed feature decompression
16
10
 
17
- # Classification Encoder
18
- from sklearn.preprocessing import LabelEncoder
19
-
20
- # Scikit Learn Imports
21
- from sklearn.model_selection import train_test_split
22
-
23
- from io import StringIO
24
- import json
25
11
  import argparse
26
- import joblib
12
+ import json
27
13
  import os
14
+
15
+ import awswrangler as wr
16
+ import joblib
17
+ import numpy as np
28
18
  import pandas as pd
29
- from typing import List, Tuple
19
+ import xgboost as xgb
20
+ from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
21
+ from sklearn.preprocessing import LabelEncoder
30
22
 
31
- # Default Hyperparameters for XGBoost
23
+ from model_script_utils import (
24
+ check_dataframe,
25
+ compute_classification_metrics,
26
+ compute_regression_metrics,
27
+ convert_categorical_types,
28
+ decompress_features,
29
+ expand_proba_column,
30
+ input_fn,
31
+ match_features_case_insensitive,
32
+ output_fn,
33
+ print_classification_metrics,
34
+ print_confusion_matrix,
35
+ print_regression_metrics,
36
+ )
37
+ from uq_harness import (
38
+ compute_confidence,
39
+ load_uq_models,
40
+ predict_intervals,
41
+ save_uq_models,
42
+ train_uq_models,
43
+ )
44
+
45
+ # =============================================================================
46
+ # Default Hyperparameters
47
+ # =============================================================================
32
48
  DEFAULT_HYPERPARAMETERS = {
49
+ # Training parameters
50
+ "n_folds": 5, # Number of CV folds (1 = single train/val split)
33
51
  # Core tree parameters
34
- "n_estimators": 200, # More trees for better signal capture when we have lots of features
35
- "max_depth": 6, # Medium depth
36
- "learning_rate": 0.05, # Lower rate with more estimators for smoother learning
37
-
52
+ "n_estimators": 200,
53
+ "max_depth": 6,
54
+ "learning_rate": 0.05,
38
55
  # Sampling parameters
39
- "subsample": 0.7, # Moderate row sampling to reduce overfitting
40
- "colsample_bytree": 0.6, # More aggressive feature sampling given lots of features
41
- "colsample_bylevel": 0.8, # Additional feature sampling at each tree level
42
-
56
+ "subsample": 0.7,
57
+ "colsample_bytree": 0.6,
58
+ "colsample_bylevel": 0.8,
43
59
  # Regularization
44
- "min_child_weight": 5, # Higher to prevent overfitting on small groups
45
- "gamma": 0.2, # Moderate pruning - you have real signal so don't over-prune
46
- "reg_alpha": 0.5, # L1 for feature selection (useful with many features)
47
- "reg_lambda": 2.0, # Strong L2 to smooth predictions
48
-
60
+ "min_child_weight": 5,
61
+ "gamma": 0.2,
62
+ "reg_alpha": 0.5,
63
+ "reg_lambda": 2.0,
49
64
  # Random seed
50
65
  "random_state": 42,
51
66
  }
52
67
 
53
- # Template Parameters
68
+ # Workbench-specific parameters (not passed to XGBoost)
69
+ WORKBENCH_PARAMS = {"n_folds"}
70
+
71
+ # Template parameters (filled in by Workbench)
54
72
  TEMPLATE_PARAMS = {
55
- "model_type": "classifier",
56
- "target": "wine_class",
57
- "features": ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280_od315_of_diluted_wines', 'proline'],
73
+ "model_type": "uq_regressor",
74
+ "target": "udm_asy_res_efflux_ratio",
75
+ "features": ['smr_vsa4', 'tpsa', 'numhdonors', 'nhohcount', 'nbase', 'vsa_estate3', 'fr_guanido', 'mollogp', 'peoe_vsa8', 'peoe_vsa1', 'fr_imine', 'vsa_estate2', 'estate_vsa10', 'asphericity', 'xc_3dv', 'smr_vsa3', 'charge_centroid_distance', 'c3sp3', 'nitrogen_span', 'estate_vsa2', 'minpartialcharge', 'hba_hbd_ratio', 'slogp_vsa1', 'axp_7d', 'nocount', 'vsa_estate4', 'vsa_estate6', 'estate_vsa4', 'xc_4dv', 'xc_4d', 'num_s_centers', 'vsa_estate9', 'chi2v', 'axp_5d', 'mi', 'mse', 'bcut2d_mrhi', 'smr_vsa6', 'hallkieralpha', 'balabanj', 'amphiphilic_moment', 'type_ii_pattern_count', 'minabsestateindex', 'bcut2d_mwlow', 'axp_0dv', 'slogp_vsa5', 'axp_2d', 'axp_1dv', 'xch_5d', 'peoe_vsa10', 'molecular_asymmetry', 'kappa3', 'estate_vsa3', 'sse', 'bcut2d_logphi', 'fr_imidazole', 'molecular_volume_3d', 'bertzct', 'maxestateindex', 'aromatic_interaction_score', 'axp_3d', 'radius_of_gyration', 'vsa_estate7', 'si', 'axp_5dv', 'molecular_axis_length', 'estate_vsa6', 'fpdensitymorgan1', 'axp_6d', 'estate_vsa9', 'fpdensitymorgan2', 'xp_0dv', 'xp_6dv', 'molmr', 'qed', 'estate_vsa8', 'peoe_vsa9', 'xch_6dv', 'xp_7d', 'slogp_vsa2', 'xp_5dv', 'bcut2d_chghi', 'xch_6d', 'chi0n', 'slogp_vsa3', 'chi1v', 'chi3v', 'bcut2d_chglo', 'axp_1d', 'mp', 'num_defined_stereocenters', 'xp_3dv', 'bcut2d_mrlow', 'fr_al_oh', 'peoe_vsa7', 'chi2n', 'axp_6dv', 'axp_2dv', 'chi4n', 'xc_3d', 'axp_7dv', 'vsa_estate8', 'xch_7d', 'maxpartialcharge', 'chi1n', 'peoe_vsa2', 'axp_3dv', 'bcut2d_logplow', 'mv', 'xpc_5dv', 'kappa2', 'vsa_estate5', 'xp_5d', 'mm', 'maxabspartialcharge', 'axp_4dv', 'maxabsestateindex', 'axp_4d', 'xch_4dv', 'xp_2dv', 'heavyatommolwt', 'numatomstereocenters', 'xp_7dv', 'numsaturatedheterocycles', 'xp_3d', 'kappa1', 'mz', 'axp_0d', 'chi1', 'xch_4d', 'smr_vsa1', 'xp_2d', 'estate_vsa5', 'phi', 'fr_ether', 'xc_5d', 'c1sp3', 'estate_vsa7', 'estate_vsa1', 'vsa_estate1', 'slogp_vsa4', 'avgipc', 'smr_vsa10', 'numvalenceelectrons', 'xc_5dv', 'peoe_vsa12', 'peoe_vsa6', 'xpc_5d', 'xpc_6d', 'minestateindex', 'chi3n', 'smr_vsa5', 'xp_4d', 'numheteroatoms', 'fpdensitymorgan3', 'xpc_4d', 'sps', 'xp_1d', 'sv', 'fr_ar_n', 'slogp_vsa10', 'c2sp3', 'xpc_4dv', 'chi0v', 'xpc_6dv', 'xp_1dv', 'vsa_estate10', 'sare', 'c2sp2', 'mpe', 'xch_7dv', 'chi4v', 'type_i_pattern_count', 'sp', 'slogp_vsa8', 'amide_count', 'num_stereocenters', 'num_r_centers', 'tertiary_amine_count', 'spe', 'xp_4dv', 'numsaturatedrings', 'mare', 'numhacceptors', 'chi0', 'fractioncsp3', 'fr_nh0', 'xch_5dv', 'fr_aniline', 'smr_vsa7', 'labuteasa', 'c3sp2', 'xp_0d', 'xp_6d', 'peoe_vsa11', 'fr_ar_nh', 'molwt', 'intramolecular_hbond_potential', 'peoe_vsa3', 'fr_nhpyrrole', 'numaliphaticrings', 'hybratio', 'smr_vsa9', 'peoe_vsa13', 'bcut2d_mwhi', 'c1sp2', 'slogp_vsa11', 'numrotatablebonds', 'numaliphaticcarbocycles', 'slogp_vsa6', 'peoe_vsa4', 'numunspecifiedatomstereocenters', 'xc_6d', 'xc_6dv', 'num_unspecified_stereocenters', 'sz', 'minabspartialcharge', 'fcsp3', 'c1sp1', 'fr_piperzine', 'numaliphaticheterocycles', 'numamidebonds', 'fr_benzene', 'numaromaticheterocycles', 'sm', 'fr_priamide', 'fr_piperdine', 'fr_methoxy', 'c4sp3', 'fr_c_o_nocoo', 'exactmolwt', 'stereo_complexity', 'fr_hoccn', 'numaromaticcarbocycles', 'fr_nh2', 'numheterocycles', 'fr_morpholine', 'fr_ketone', 'fr_nh1', 'frac_defined_stereo', 'fr_aryl_methyl', 'fr_alkyl_halide', 'fr_phenol', 'fr_al_oh_notert', 'fr_ar_oh', 'fr_pyridine', 'fr_amide', 'slogp_vsa7', 'fr_halogen', 'numsaturatedcarbocycles', 'slogp_vsa12', 'fr_ndealkylation1', 'xch_3d', 'fr_bicyclic', 'naromatom', 'narombond'],
76
+ "id_column": "udm_mol_bat_id",
58
77
  "compressed_features": [],
59
- "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/wine-classification/training",
60
- "train_all_data": False,
61
- "hyperparameters": {},
78
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-test-log/training",
79
+ "hyperparameters": {'target_transform': 'log'},
62
80
  }
63
81
 
64
82
 
65
- # Function to check if dataframe is empty
66
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
67
- """
68
- Check if the provided dataframe is empty and raise an exception if it is.
69
-
70
- Args:
71
- df (pd.DataFrame): DataFrame to check
72
- df_name (str): Name of the DataFrame
73
- """
74
- if df.empty:
75
- msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
76
- print(msg)
77
- raise ValueError(msg)
78
-
79
-
80
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
81
- """
82
- Expands a column in a DataFrame containing a list of probabilities into separate columns.
83
-
84
- Args:
85
- df (pd.DataFrame): DataFrame containing a "pred_proba" column
86
- class_labels (List[str]): List of class labels
87
-
88
- Returns:
89
- pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
90
- """
91
-
92
- # Sanity check
93
- proba_column = "pred_proba"
94
- if proba_column not in df.columns:
95
- raise ValueError('DataFrame does not contain a "pred_proba" column')
96
-
97
- # Construct new column names with '_proba' suffix
98
- proba_splits = [f"{label}_proba" for label in class_labels]
99
-
100
- # Expand the proba_column into separate columns for each probability
101
- proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
102
-
103
- # Drop any proba columns and reset the index in prep for the concat
104
- df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
105
- df = df.reset_index(drop=True)
106
-
107
- # Concatenate the new columns with the original DataFrame
108
- df = pd.concat([df, proba_df], axis=1)
109
- print(df)
110
- return df
111
-
112
-
113
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
114
- """
115
- Matches and renames DataFrame columns to match model feature names (case-insensitive).
116
- Prioritizes exact matches, then case-insensitive matches.
117
-
118
- Raises ValueError if any model features cannot be matched.
119
- """
120
- df_columns_lower = {col.lower(): col for col in df.columns}
121
- rename_dict = {}
122
- missing = []
123
- for feature in model_features:
124
- if feature in df.columns:
125
- continue # Exact match
126
- elif feature.lower() in df_columns_lower:
127
- rename_dict[df_columns_lower[feature.lower()]] = feature
128
- else:
129
- missing.append(feature)
130
-
131
- if missing:
132
- raise ValueError(f"Features not found: {missing}")
133
-
134
- # Rename the DataFrame columns to match the model features
135
- return df.rename(columns=rename_dict)
136
-
137
-
138
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
139
- """
140
- Converts appropriate columns to categorical type with consistent mappings.
141
-
142
- Args:
143
- df (pd.DataFrame): The DataFrame to process.
144
- features (list): List of feature names to consider for conversion.
145
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
146
- training mode. If populated, we're in inference mode.
147
-
148
- Returns:
149
- tuple: (processed DataFrame, category mappings dictionary)
150
- """
151
- # Training mode
152
- if category_mappings == {}:
153
- for col in df.select_dtypes(include=["object", "string"]):
154
- if col in features and df[col].nunique() < 20:
155
- print(f"Training mode: Converting {col} to category")
156
- df[col] = df[col].astype("category")
157
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
158
-
159
- # Inference mode
160
- else:
161
- for col, categories in category_mappings.items():
162
- if col in df.columns:
163
- print(f"Inference mode: Applying categorical mapping for {col}")
164
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
165
-
166
- return df, category_mappings
167
-
168
-
169
- def decompress_features(
170
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
171
- ) -> Tuple[pd.DataFrame, List[str]]:
172
- """Prepare features for the model by decompressing bitstring features
173
-
174
- Args:
175
- df (pd.DataFrame): The features DataFrame
176
- features (List[str]): Full list of feature names
177
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
178
-
179
- Returns:
180
- pd.DataFrame: DataFrame with the decompressed features
181
- List[str]: Updated list of feature names after decompression
182
-
183
- Raises:
184
- ValueError: If any missing values are found in the specified features
185
- """
186
-
187
- # Check for any missing values in the required features
188
- missing_counts = df[features].isna().sum()
189
- if missing_counts.any():
190
- missing_features = missing_counts[missing_counts > 0]
191
- print(
192
- f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
193
- "WARNING: You might want to remove/replace all NaN values before processing."
194
- )
195
-
196
- # Decompress the specified compressed features
197
- decompressed_features = features.copy()
198
- for feature in compressed_features:
199
- if (feature not in df.columns) or (feature not in features):
200
- print(f"Feature '{feature}' not in the features list, skipping decompression.")
201
- continue
202
-
203
- # Remove the feature from the list of features to avoid duplication
204
- decompressed_features.remove(feature)
205
-
206
- # Handle all compressed features as bitstrings
207
- bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
208
- prefix = feature[:3]
209
-
210
- # Create all new columns at once - avoids fragmentation
211
- new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
212
- new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
213
-
214
- # Add to features list
215
- decompressed_features.extend(new_col_names)
216
-
217
- # Drop original column and concatenate new ones
218
- df = df.drop(columns=[feature])
219
- df = pd.concat([df, new_df], axis=1)
220
-
221
- return df, decompressed_features
222
-
223
-
83
+ # =============================================================================
84
+ # Training
85
+ # =============================================================================
224
86
  if __name__ == "__main__":
225
- """The main function is for training the XGBoost model"""
87
+ # -------------------------------------------------------------------------
88
+ # Setup: Parse arguments and load data
89
+ # -------------------------------------------------------------------------
90
+ parser = argparse.ArgumentParser()
91
+ parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
92
+ parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
93
+ parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
94
+ args = parser.parse_args()
226
95
 
227
- # Harness Template Parameters
96
+ # Extract template parameters
228
97
  target = TEMPLATE_PARAMS["target"]
229
98
  features = TEMPLATE_PARAMS["features"]
230
99
  orig_features = features.copy()
100
+ id_column = TEMPLATE_PARAMS["id_column"]
231
101
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
232
102
  model_type = TEMPLATE_PARAMS["model_type"]
233
103
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
234
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
235
- hyperparameters = {**DEFAULT_HYPERPARAMETERS, **TEMPLATE_PARAMS["hyperparameters"]}
236
- validation_split = 0.2
237
-
238
- # Script arguments for input/output directories
239
- parser = argparse.ArgumentParser()
240
- parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
241
- parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
242
- parser.add_argument(
243
- "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
244
- )
245
- args = parser.parse_args()
104
+ hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
246
105
 
247
- # Read the training data into DataFrames
248
- training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
106
+ # Load training data
107
+ training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
249
108
  print(f"Training Files: {training_files}")
250
-
251
- # Combine files and read them all into a single pandas dataframe
252
- all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
253
-
254
- # Check if the dataframe is empty
109
+ all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
255
110
  check_dataframe(all_df, "training_df")
256
111
 
257
- # Features/Target output
258
112
  print(f"Target: {target}")
259
- print(f"Features: {str(features)}")
113
+ print(f"Features: {features}")
114
+ print(f"Hyperparameters: {hyperparameters}")
260
115
 
261
- # Convert any features that might be categorical to 'category' type
116
+ # -------------------------------------------------------------------------
117
+ # Preprocessing: Categorical features and decompression
118
+ # -------------------------------------------------------------------------
262
119
  all_df, category_mappings = convert_categorical_types(all_df, features)
263
120
 
264
- # If we have compressed features, decompress them
265
121
  if compressed_features:
266
- print(f"Decompressing features {compressed_features}...")
122
+ print(f"Decompressing features: {compressed_features}")
267
123
  all_df, features = decompress_features(all_df, features, compressed_features)
268
124
 
269
- # Do we want to train on all the data?
270
- if train_all_data:
271
- print("Training on ALL of the data")
272
- df_train = all_df.copy()
273
- df_val = all_df.copy()
274
-
275
- # Does the dataframe have a training column?
276
- elif "training" in all_df.columns:
277
- print("Found training column, splitting data based on training column")
278
- df_train = all_df[all_df["training"]]
279
- df_val = all_df[~all_df["training"]]
125
+ # -------------------------------------------------------------------------
126
+ # Classification setup: Encode target labels
127
+ # -------------------------------------------------------------------------
128
+ label_encoder = None
129
+ if model_type == "classifier":
130
+ label_encoder = LabelEncoder()
131
+ all_df[target] = label_encoder.fit_transform(all_df[target])
132
+ print(f"Class labels: {label_encoder.classes_.tolist()}")
133
+
134
+ # -------------------------------------------------------------------------
135
+ # Cross-validation setup
136
+ # -------------------------------------------------------------------------
137
+ n_folds = hyperparameters["n_folds"]
138
+ xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
139
+ print(f"XGBoost params: {xgb_params}")
140
+
141
+ if n_folds == 1:
142
+ # Single train/val split
143
+ if "training" in all_df.columns:
144
+ print("Using 'training' column for train/val split")
145
+ train_idx = np.where(all_df["training"])[0]
146
+ val_idx = np.where(~all_df["training"])[0]
147
+ else:
148
+ print("WARNING: No 'training' column found, using random 80/20 split")
149
+ indices = np.arange(len(all_df))
150
+ train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
151
+ folds = [(train_idx, val_idx)]
280
152
  else:
281
- # Just do a random training Split
282
- print("WARNING: No training column found, splitting data with random state=42")
283
- df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
284
- print(f"FIT/TRAIN: {df_train.shape}")
285
- print(f"VALIDATION: {df_val.shape}")
153
+ # K-fold cross-validation
154
+ if model_type == "classifier":
155
+ kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
156
+ folds = list(kfold.split(all_df, all_df[target]))
157
+ else:
158
+ kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
159
+ folds = list(kfold.split(all_df))
286
160
 
287
- # Use any hyperparameters to set up both the trainer and model configurations
288
- print(f"Hyperparameters: {hyperparameters}")
161
+ print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
289
162
 
290
- # Now spin up our XGB Model
163
+ # -------------------------------------------------------------------------
164
+ # Training loop
165
+ # -------------------------------------------------------------------------
166
+ # Initialize out-of-fold storage
167
+ oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
291
168
  if model_type == "classifier":
292
- xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
293
-
294
- # Encode the target column
295
- label_encoder = LabelEncoder()
296
- df_train[target] = label_encoder.fit_transform(df_train[target])
297
- df_val[target] = label_encoder.transform(df_val[target])
298
-
169
+ num_classes = len(label_encoder.classes_)
170
+ oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
299
171
  else:
300
- xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
301
- label_encoder = None # We don't need this for regression
302
-
303
- # Grab our Features, Target and Train the Model
304
- y_train = df_train[target]
305
- X_train = df_train[features]
306
- xgb_model.fit(X_train, y_train)
307
-
308
- # Make Predictions on the Validation Set
309
- print(f"Making Predictions on Validation Set...")
310
- y_validate = df_val[target]
311
- X_validate = df_val[features]
312
- preds = xgb_model.predict(X_validate)
172
+ oof_proba = None
173
+
174
+ # Check for sample weights
175
+ has_sample_weights = "sample_weight" in all_df.columns
176
+ if has_sample_weights:
177
+ sw = all_df["sample_weight"]
178
+ print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
179
+
180
+ # Train ensemble
181
+ ensemble_models = []
182
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
183
+ print(f"\n{'='*50}")
184
+ print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
185
+ print(f"{'='*50}")
186
+
187
+ # Prepare fold data
188
+ X_train = all_df.iloc[train_idx][features]
189
+ y_train = all_df.iloc[train_idx][target]
190
+ X_val = all_df.iloc[val_idx][features]
191
+ sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
192
+
193
+ # Create model with fold-specific random state for diversity
194
+ fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
195
+ if model_type == "classifier":
196
+ model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
197
+ else:
198
+ model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
199
+
200
+ # Train
201
+ model.fit(X_train, y_train, sample_weight=sample_weights)
202
+ ensemble_models.append(model)
203
+
204
+ # Out-of-fold predictions
205
+ oof_predictions[val_idx] = model.predict(X_val)
206
+ if model_type == "classifier":
207
+ oof_proba[val_idx] = model.predict_proba(X_val)
208
+
209
+ print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
210
+
211
+ # -------------------------------------------------------------------------
212
+ # Prepare validation results
213
+ # -------------------------------------------------------------------------
214
+ if n_folds == 1:
215
+ # Single fold: only validation rows
216
+ val_mask = ~np.isnan(oof_predictions)
217
+ df_val = all_df[val_mask].copy()
218
+ predictions = oof_predictions[val_mask]
219
+ if oof_proba is not None:
220
+ oof_proba = oof_proba[val_mask]
221
+ else:
222
+ # K-fold: all rows have out-of-fold predictions
223
+ df_val = all_df.copy()
224
+ predictions = oof_predictions
225
+
226
+ # Decode labels for classification
313
227
  if model_type == "classifier":
314
- # Also get the probabilities for each class
315
- print("Processing Probabilities...")
316
- probs = xgb_model.predict_proba(X_validate)
317
- df_val["pred_proba"] = [p.tolist() for p in probs]
318
-
319
- # Expand the pred_proba column into separate columns for each class
320
- print(df_val.columns)
321
- df_val = expand_proba_column(df_val, label_encoder.classes_)
322
- print(df_val.columns)
323
-
324
- # Decode the target and prediction labels
325
- y_validate = label_encoder.inverse_transform(y_validate)
326
- preds = label_encoder.inverse_transform(preds)
327
-
328
- # Save predictions to S3 (just the target, prediction, and '_proba' columns)
329
- df_val["prediction"] = preds
330
- output_columns = [target, "prediction"]
331
- output_columns += [col for col in df_val.columns if col.endswith("_proba")]
332
- wr.s3.to_csv(
333
- df_val[output_columns],
334
- path=f"{model_metrics_s3_path}/validation_predictions.csv",
335
- index=False,
336
- )
337
-
338
- # Report Performance Metrics
228
+ df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
229
+ df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
230
+ if oof_proba is not None:
231
+ df_val["pred_proba"] = [p.tolist() for p in oof_proba]
232
+ df_val = expand_proba_column(df_val, label_encoder.classes_)
233
+ else:
234
+ df_val["prediction"] = predictions
235
+
236
+ # -------------------------------------------------------------------------
237
+ # Compute and print metrics
238
+ # -------------------------------------------------------------------------
239
+ y_true = df_val[target].values
240
+ y_pred = df_val["prediction"].values
241
+
339
242
  if model_type == "classifier":
340
- # Get the label names and their integer mapping
341
243
  label_names = label_encoder.classes_
342
-
343
- # Calculate various model performance metrics
344
- scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
345
-
346
- # Put the scores into a dataframe
347
- score_df = pd.DataFrame(
348
- {
349
- target: label_names,
350
- "precision": scores[0],
351
- "recall": scores[1],
352
- "f1": scores[2],
353
- "support": scores[3],
354
- }
244
+ score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
245
+ print_classification_metrics(score_df, target, label_names)
246
+ print_confusion_matrix(y_true, y_pred, label_names)
247
+ else:
248
+ metrics = compute_regression_metrics(y_true, y_pred)
249
+ print_regression_metrics(metrics)
250
+
251
+ # Compute ensemble prediction_std
252
+ if n_folds > 1:
253
+ all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
254
+ df_val["prediction_std"] = np.std(all_preds, axis=0)
255
+ print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
256
+ else:
257
+ df_val["prediction_std"] = 0.0
258
+
259
+ # Train UQ models for uncertainty quantification
260
+ print("\n" + "=" * 50)
261
+ print("Training UQ Models")
262
+ print("=" * 50)
263
+ uq_models, uq_metadata = train_uq_models(
264
+ all_df[features], all_df[target], df_val[features], y_true
355
265
  )
266
+ df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
267
+ df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
356
268
 
357
- # We need to get creative with the Classification Metrics
358
- metrics = ["precision", "recall", "f1", "support"]
359
- for t in label_names:
360
- for m in metrics:
361
- value = score_df.loc[score_df[target] == t, m].iloc[0]
362
- print(f"Metrics:{t}:{m} {value}")
269
+ # -------------------------------------------------------------------------
270
+ # Save validation predictions to S3
271
+ # -------------------------------------------------------------------------
272
+ output_columns = []
273
+ if id_column in df_val.columns:
274
+ output_columns.append(id_column)
275
+ output_columns += [target, "prediction"]
363
276
 
364
- # Compute and output the confusion matrix
365
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
366
- for i, row_name in enumerate(label_names):
367
- for j, col_name in enumerate(label_names):
368
- value = conf_mtx[i, j]
369
- print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
277
+ if model_type != "classifier":
278
+ output_columns.append("prediction_std")
279
+ output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
370
280
 
371
- else:
372
- # Calculate various model performance metrics (regression)
373
- rmse = root_mean_squared_error(y_validate, preds)
374
- mae = mean_absolute_error(y_validate, preds)
375
- medae = median_absolute_error(y_validate, preds)
376
- r2 = r2_score(y_validate, preds)
377
- spearman_corr = spearmanr(y_validate, preds).correlation
378
- support = len(df_val)
379
- print(f"rmse: {rmse:.3f}")
380
- print(f"mae: {mae:.3f}")
381
- print(f"medae: {medae:.3f}")
382
- print(f"r2: {r2:.3f}")
383
- print(f"spearmanr: {spearman_corr:.3f}")
384
- print(f"support: {support}")
385
-
386
- # Now save the model to the standard place/name
387
- joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
388
-
389
- # Save the label encoder if we have one
390
- if label_encoder:
391
- joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
281
+ output_columns += [c for c in df_val.columns if c.endswith("_proba")]
392
282
 
393
- # Save the features (this will validate input during predictions)
394
- with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
395
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
283
+ wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
396
284
 
397
- # Save the category mappings
398
- with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
399
- json.dump(category_mappings, fp)
285
+ # -------------------------------------------------------------------------
286
+ # Save model artifacts
287
+ # -------------------------------------------------------------------------
288
+ # Ensemble models
289
+ for idx, ens_model in enumerate(ensemble_models):
290
+ joblib.dump(ens_model, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
291
+ print(f"Saved {len(ensemble_models)} XGBoost model(s)")
400
292
 
293
+ # Metadata files
294
+ with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
295
+ json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)
401
296
 
402
- def model_fn(model_dir):
403
- """Deserialize and return fitted XGBoost model"""
404
- model_path = os.path.join(model_dir, "xgb_model.joblib")
405
- model = joblib.load(model_path)
406
- return model
297
+ with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
298
+ json.dump(orig_features, f)
407
299
 
300
+ with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
301
+ json.dump(category_mappings, f)
408
302
 
409
- def input_fn(input_data, content_type):
410
- """Parse input data and return a DataFrame."""
411
- if not input_data:
412
- raise ValueError("Empty input data is not supported!")
303
+ with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
304
+ json.dump(hyperparameters, f, indent=2)
413
305
 
414
- # Decode bytes to string if necessary
415
- if isinstance(input_data, bytes):
416
- input_data = input_data.decode("utf-8")
306
+ if label_encoder:
307
+ joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
417
308
 
418
- if "text/csv" in content_type:
419
- return pd.read_csv(StringIO(input_data))
420
- elif "application/json" in content_type:
421
- return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
422
- else:
423
- raise ValueError(f"{content_type} not supported!")
309
+ if model_type != "classifier":
310
+ save_uq_models(uq_models, uq_metadata, args.model_dir)
311
+
312
+ print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
424
313
 
425
314
 
426
- def output_fn(output_df, accept_type):
427
- """Supports both CSV and JSON output formats."""
428
- if "text/csv" in accept_type:
429
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
430
- return csv_output, "text/csv"
431
- elif "application/json" in accept_type:
432
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
315
+ # =============================================================================
316
+ # Model Loading (for SageMaker inference)
317
+ # =============================================================================
318
+ def model_fn(model_dir: str) -> dict:
319
+ """Load XGBoost ensemble and associated artifacts.
320
+
321
+ Args:
322
+ model_dir: Directory containing model artifacts
323
+
324
+ Returns:
325
+ Dictionary with ensemble_models, label_encoder, category_mappings, uq_models, etc.
326
+ """
327
+ # Load ensemble metadata
328
+ metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
329
+ if os.path.exists(metadata_path):
330
+ with open(metadata_path) as f:
331
+ metadata = json.load(f)
332
+ n_ensemble = metadata["n_ensemble"]
433
333
  else:
434
- raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
334
+ n_ensemble = 1 # Legacy single model
435
335
 
336
+ # Load ensemble models
337
+ ensemble_models = []
338
+ for i in range(n_ensemble):
339
+ model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
340
+ if not os.path.exists(model_path):
341
+ model_path = os.path.join(model_dir, "xgb_model.joblib") # Legacy fallback
342
+ ensemble_models.append(joblib.load(model_path))
436
343
 
437
- def predict_fn(df, model) -> pd.DataFrame:
438
- """Make Predictions with our XGB Model
344
+ # Load label encoder (classifier only)
345
+ label_encoder = None
346
+ encoder_path = os.path.join(model_dir, "label_encoder.joblib")
347
+ if os.path.exists(encoder_path):
348
+ label_encoder = joblib.load(encoder_path)
349
+
350
+ # Load category mappings
351
+ category_mappings = {}
352
+ category_path = os.path.join(model_dir, "category_mappings.json")
353
+ if os.path.exists(category_path):
354
+ with open(category_path) as f:
355
+ category_mappings = json.load(f)
356
+
357
+ # Load UQ models (regression only)
358
+ uq_models, uq_metadata = None, None
359
+ uq_path = os.path.join(model_dir, "uq_metadata.json")
360
+ if os.path.exists(uq_path):
361
+ uq_models, uq_metadata = load_uq_models(model_dir)
362
+
363
+ return {
364
+ "ensemble_models": ensemble_models,
365
+ "n_ensemble": n_ensemble,
366
+ "label_encoder": label_encoder,
367
+ "category_mappings": category_mappings,
368
+ "uq_models": uq_models,
369
+ "uq_metadata": uq_metadata,
370
+ }
371
+
372
+
373
+ # =============================================================================
374
+ # Inference (for SageMaker inference)
375
+ # =============================================================================
376
+ def predict_fn(df: pd.DataFrame, models: dict) -> pd.DataFrame:
377
+ """Make predictions with XGBoost ensemble.
439
378
 
440
379
  Args:
441
- df (pd.DataFrame): The input DataFrame
442
- model: The model use for predictions
380
+ df: Input DataFrame with features
381
+ models: Dictionary from model_fn containing ensemble and metadata
443
382
 
444
383
  Returns:
445
- pd.DataFrame: The DataFrame with the predictions added
384
+ DataFrame with predictions added
446
385
  """
447
- compressed_features = TEMPLATE_PARAMS["compressed_features"]
448
-
449
- # Grab our feature columns (from training)
386
+ # Load feature columns
450
387
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
451
- with open(os.path.join(model_dir, "feature_columns.json")) as fp:
452
- features = json.load(fp)
388
+ with open(os.path.join(model_dir, "feature_columns.json")) as f:
389
+ features = json.load(f)
453
390
  print(f"Model Features: {features}")
454
391
 
455
- # Load the category mappings (from training)
456
- with open(os.path.join(model_dir, "category_mappings.json")) as fp:
457
- category_mappings = json.load(fp)
458
-
459
- # Load our Label Encoder if we have one
460
- label_encoder = None
461
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
462
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
392
+ # Extract model components
393
+ ensemble_models = models["ensemble_models"]
394
+ label_encoder = models.get("label_encoder")
395
+ category_mappings = models.get("category_mappings", {})
396
+ uq_models = models.get("uq_models")
397
+ uq_metadata = models.get("uq_metadata")
398
+ compressed_features = TEMPLATE_PARAMS["compressed_features"]
463
399
 
464
- # We're going match features in a case-insensitive manner, accounting for all the permutations
465
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
466
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
400
+ # Prepare features
467
401
  matched_df = match_features_case_insensitive(df, features)
468
-
469
- # Detect categorical types in the incoming DataFrame
470
402
  matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
471
403
 
472
- # If we have compressed features, decompress them
473
404
  if compressed_features:
474
405
  print("Decompressing features for prediction...")
475
406
  matched_df, features = decompress_features(matched_df, features, compressed_features)
476
407
 
477
- # Predict the features against our XGB Model
478
408
  X = matched_df[features]
479
- predictions = model.predict(X)
480
409
 
481
- # If we have a label encoder, decode the predictions
482
- if label_encoder:
483
- predictions = label_encoder.inverse_transform(predictions)
484
-
485
- # Set the predictions on the DataFrame
486
- df["prediction"] = predictions
410
+ # Collect ensemble predictions
411
+ all_preds = [m.predict(X) for m in ensemble_models]
412
+ ensemble_preds = np.stack(all_preds, axis=0)
487
413
 
488
- # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
489
- if getattr(model, "predict_proba", None):
490
- probs = model.predict_proba(matched_df[features])
491
- df["pred_proba"] = [p.tolist() for p in probs]
414
+ if label_encoder is not None:
415
+ # Classification: average probabilities, then argmax
416
+ all_probs = [m.predict_proba(X) for m in ensemble_models]
417
+ avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
418
+ class_preds = np.argmax(avg_probs, axis=1)
492
419
 
493
- # Expand the pred_proba column into separate columns for each class
420
+ df["prediction"] = label_encoder.inverse_transform(class_preds)
421
+ df["pred_proba"] = [p.tolist() for p in avg_probs]
494
422
  df = expand_proba_column(df, label_encoder.classes_)
423
+ else:
424
+ # Regression: average predictions
425
+ df["prediction"] = np.mean(ensemble_preds, axis=0)
426
+ df["prediction_std"] = np.std(ensemble_preds, axis=0)
427
+
428
+ # Add UQ intervals if available
429
+ if uq_models and uq_metadata:
430
+ df = predict_intervals(df, X, uq_models, uq_metadata)
431
+ df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
495
432
 
496
- # All done, return the DataFrame with new columns for the predictions
433
+ print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
497
434
  return df
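
For readers skimming the hunk above: the heart of the new template is the standard out-of-fold (OOF) k-fold pattern described in its header comment. Each fold's model is fit on the other folds and predicts only its own held-out rows, so every row ends up with one validation prediction, and the fold models double as an ensemble at inference time (mean prediction, std as a spread estimate). The sketch below illustrates that pattern only; it uses synthetic data and plain XGBoost/scikit-learn calls, not the Workbench helpers (model_script_utils, uq_harness) that appear in the diff:

    # Minimal sketch of the out-of-fold (OOF) ensemble pattern used by the new
    # template. Illustration only -- data and names here are made up, not part
    # of the workbench package.
    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import make_regression
    from sklearn.model_selection import KFold

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)

    n_folds = 5
    oof_pred = np.full(len(y), np.nan)   # one held-out prediction per row
    models = []

    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X)):
        # Fold-specific random_state adds a little diversity across members
        model = xgb.XGBRegressor(n_estimators=50, random_state=42 + fold_idx)
        model.fit(X[train_idx], y[train_idx])
        oof_pred[val_idx] = model.predict(X[val_idx])  # out-of-fold predictions
        models.append(model)

    # At inference time the fold models act as an ensemble: the mean is the
    # prediction, the std across members is a cheap spread estimate
    # (the template's "prediction_std" column).
    ensemble = np.stack([m.predict(X) for m in models])
    prediction = ensemble.mean(axis=0)
    prediction_std = ensemble.std(axis=0)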