workbench-0.8.202-py3-none-any.whl → workbench-0.8.220-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (84)
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  3. workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
  4. workbench/algorithms/dataframe/projection_2d.py +44 -21
  5. workbench/algorithms/dataframe/proximity.py +78 -150
  6. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  7. workbench/algorithms/models/cleanlab_model.py +382 -0
  8. workbench/algorithms/models/noise_model.py +388 -0
  9. workbench/algorithms/sql/outliers.py +3 -3
  10. workbench/api/__init__.py +3 -0
  11. workbench/api/df_store.py +17 -108
  12. workbench/api/endpoint.py +13 -11
  13. workbench/api/feature_set.py +111 -8
  14. workbench/api/meta_model.py +289 -0
  15. workbench/api/model.py +45 -12
  16. workbench/api/parameter_store.py +3 -52
  17. workbench/cached/cached_model.py +4 -4
  18. workbench/core/artifacts/artifact.py +5 -5
  19. workbench/core/artifacts/df_store_core.py +114 -0
  20. workbench/core/artifacts/endpoint_core.py +228 -237
  21. workbench/core/artifacts/feature_set_core.py +185 -230
  22. workbench/core/artifacts/model_core.py +34 -26
  23. workbench/core/artifacts/parameter_store_core.py +98 -0
  24. workbench/core/pipelines/pipeline_executor.py +1 -1
  25. workbench/core/transforms/features_to_model/features_to_model.py +22 -10
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  28. workbench/model_script_utils/model_script_utils.py +339 -0
  29. workbench/model_script_utils/pytorch_utils.py +405 -0
  30. workbench/model_script_utils/uq_harness.py +278 -0
  31. workbench/model_scripts/chemprop/chemprop.template +428 -631
  32. workbench/model_scripts/chemprop/generated_model_script.py +432 -635
  33. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  34. workbench/model_scripts/chemprop/requirements.txt +2 -10
  35. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  36. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  37. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  38. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  39. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  40. workbench/model_scripts/meta_model/meta_model.template +209 -0
  41. workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
  42. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  43. workbench/model_scripts/pytorch_model/pytorch.template +370 -609
  44. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  45. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  46. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  47. workbench/model_scripts/script_generation.py +6 -5
  48. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  49. workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
  50. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  51. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  52. workbench/model_scripts/xgb_model/xgb_model.template +366 -396
  53. workbench/repl/workbench_shell.py +0 -5
  54. workbench/resources/open_source_api.key +1 -1
  55. workbench/scripts/endpoint_test.py +2 -2
  56. workbench/scripts/meta_model_sim.py +35 -0
  57. workbench/scripts/training_test.py +85 -0
  58. workbench/utils/chem_utils/fingerprints.py +87 -46
  59. workbench/utils/chem_utils/projections.py +16 -6
  60. workbench/utils/chemprop_utils.py +36 -655
  61. workbench/utils/meta_model_simulator.py +499 -0
  62. workbench/utils/metrics_utils.py +256 -0
  63. workbench/utils/model_utils.py +192 -54
  64. workbench/utils/pytorch_utils.py +33 -472
  65. workbench/utils/shap_utils.py +1 -55
  66. workbench/utils/xgboost_local_crossfold.py +267 -0
  67. workbench/utils/xgboost_model_utils.py +49 -356
  68. workbench/web_interface/components/model_plot.py +7 -1
  69. workbench/web_interface/components/plugins/model_details.py +30 -68
  70. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  71. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
  72. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
  73. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
  74. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  75. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  76. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  77. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  78. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  79. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  80. workbench/model_scripts/uq_models/mapie.template +0 -605
  81. workbench/model_scripts/uq_models/requirements.txt +0 -1
  82. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  83. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
  84. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -1,475 +1,445 @@
-# Imports for XGB Model
-import xgboost as xgb
-import awswrangler as wr
+# XGBoost Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Uncertainty quantification for regression models
+# - Sample weights support
+# - Categorical feature handling
+# - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.
+
+import json
+import os
+
+import joblib
 import numpy as np
+import pandas as pd
+import xgboost as xgb
 
-# Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    median_absolute_error,
-    r2_score,
-    root_mean_squared_error,
-    precision_recall_fscore_support,
-    confusion_matrix,
+from model_script_utils import (
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
 )
-from scipy.stats import spearmanr
 
-# Classification Encoder
-from sklearn.preprocessing import LabelEncoder
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,  # Number of CV folds (1 = single train/val split)
+    # Core tree parameters
+    "n_estimators": 300,
+    "max_depth": 7,
+    "learning_rate": 0.05,
+    # Sampling parameters (less aggressive - ensemble provides regularization)
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    # Regularization (lighter - ensemble averaging reduces overfitting)
+    "min_child_weight": 3,
+    "gamma": 0.1,
+    "reg_alpha": 0.1,
+    "reg_lambda": 1.0,
+    # Random seed
+    "seed": 42,
+}
 
-# Scikit Learn Imports
-from sklearn.model_selection import train_test_split
+# Workbench-specific parameters (not passed to XGBoost)
+WORKBENCH_PARAMS = {"n_folds"}
 
-from io import StringIO
-import json
-import argparse
-import joblib
-import os
-import pandas as pd
-from typing import List, Tuple
+# Regression-only parameters (filtered out for classifiers)
+REGRESSION_ONLY_PARAMS = {"objective"}
 
-# Template Parameters
+# Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
+    "id_column": "{{id_column}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}",
     "hyperparameters": "{{hyperparameters}}",
 }
 
 
-# Function to check if dataframe is empty
-def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-    """
-    Check if the provided dataframe is empty and raise an exception if it is.
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
-
-
-def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
-    """
-    Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
-    Args:
-        df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (List[str]): List of class labels
-
-    Returns:
-        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-    """
-
-    # Sanity check
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
-
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
+def model_fn(model_dir: str) -> dict:
+    """Load XGBoost ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
+    if os.path.exists(metadata_path):
+        with open(metadata_path) as f:
+            metadata = json.load(f)
+        n_ensemble = metadata["n_ensemble"]
+    else:
+        n_ensemble = 1  # Legacy single model
 
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    print(df)
-    return df
+    # Load ensemble models
+    ensemble_models = []
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
+        if not os.path.exists(model_path):
+            model_path = os.path.join(model_dir, "xgb_model.joblib")  # Legacy fallback
+        ensemble_models.append(joblib.load(model_path))
 
+    print(f"Loaded {len(ensemble_models)} model(s)")
 
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
+    # Load label encoder (classifier only)
+    label_encoder = None
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
+
+    # Load category mappings
+    category_mappings = {}
+    category_path = os.path.join(model_dir, "category_mappings.json")
+    if os.path.exists(category_path):
+        with open(category_path) as f:
+            category_mappings = json.load(f)
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "label_encoder": label_encoder,
+        "category_mappings": category_mappings,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
+
+
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
+def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
+    """Make predictions with XGBoost ensemble."""
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    print(f"Model Features: {features}")
 
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
+    # Extract model components
+    ensemble_models = model_dict["ensemble_models"]
+    label_encoder = model_dict.get("label_encoder")
+    category_mappings = model_dict.get("category_mappings", {})
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
 
-    # Rename the DataFrame columns to match the model features
-    return df.rename(columns=rename_dict)
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
 
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
 
-def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
+    X = matched_df[features]
 
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-            training mode. If populated, we're in inference mode.
+    # Collect ensemble predictions
+    all_preds = [m.predict(X) for m in ensemble_models]
+    ensemble_preds = np.stack(all_preds, axis=0)
 
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    # Training mode
-    if category_mappings == {}:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+    if label_encoder is not None:
+        # Classification: average probabilities, then argmax
+        all_probs = [m.predict_proba(X) for m in ensemble_models]
+        avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
+        class_preds = np.argmax(avg_probs, axis=1)
 
-    # Inference mode
+        df["prediction"] = label_encoder.inverse_transform(class_preds)
+        df["pred_proba"] = [p.tolist() for p in avg_probs]
+        df = expand_proba_column(df, label_encoder.classes_)
     else:
-        for col, categories in category_mappings.items():
-            if col in df.columns:
-                print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-    return df, category_mappings
-
-
-def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
-) -> Tuple[pd.DataFrame, List[str]]:
-    """Prepare features for the model by decompressing bitstring features
-
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (List[str]): Full list of feature names
-        compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        List[str]: Updated list of feature names after decompression
-
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-
-    # Decompress the specified compressed features
-    decompressed_features = features.copy()
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-        # Add to features list
-        decompressed_features.extend(new_col_names)
+        # Regression: average predictions
+        df["prediction"] = np.mean(ensemble_preds, axis=0)
+        df["prediction_std"] = np.std(ensemble_preds, axis=0)
 
-        # Drop original column and concatenate new ones
-        df = df.drop(columns=[feature])
-        df = pd.concat([df, new_df], axis=1)
+        # Add UQ intervals if available
+        if uq_models and uq_metadata:
+            df = predict_intervals(df, X, uq_models, uq_metadata)
+            df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
 
-    return df, decompressed_features
+    print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
+    return df
 
 
+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-    """The main function is for training the XGBoost model"""
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()
 
-    # Harness Template Parameters
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
+    id_column = TEMPLATE_PARAMS["id_column"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
-    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-    validation_split = 0.2
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
 
-    # Read the training data into DataFrames
-    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
    check_dataframe(all_df, "training_df")
 
-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {str(features)}")
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")
 
-    # Convert any features that might be categorical to 'category' type
+    # -------------------------------------------------------------------------
+    # Preprocessing: Categorical features and decompression
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)
 
-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}...")
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)
 
-    # Do we want to train on all the data?
-    if train_all_data:
-        print("Training on ALL of the data")
-        df_train = all_df.copy()
-        df_val = all_df.copy()
-
-    # Does the dataframe have a training column?
-    elif "training" in all_df.columns:
-        print("Found training column, splitting data based on training column")
-        df_train = all_df[all_df["training"]]
-        df_val = all_df[~all_df["training"]]
-    else:
-        # Just do a random training Split
-        print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-    print(f"FIT/TRAIN: {df_train.shape}")
-    print(f"VALIDATION: {df_val.shape}")
-
-    # Use any hyperparameters to set up both the trainer and model configurations
-    print(f"Hyperparameters: {hyperparameters}")
-
-    # Now spin up our XGB Model
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
-
-        # Encode the target column
         label_encoder = LabelEncoder()
-        df_train[target] = label_encoder.fit_transform(df_train[target])
-        df_val[target] = label_encoder.transform(df_val[target])
-
-    else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
-        label_encoder = None  # We don't need this for regression
-
-    # Grab our Features, Target and Train the Model
-    y_train = df_train[target]
-    X_train = df_train[features]
-    xgb_model.fit(X_train, y_train)
-
-    # Make Predictions on the Validation Set
-    print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
-    preds = xgb_model.predict(X_validate)
-    if model_type == "classifier":
-        # Also get the probabilities for each class
-        print("Processing Probabilities...")
-        probs = xgb_model.predict_proba(X_validate)
-        df_val["pred_proba"] = [p.tolist() for p in probs]
-
-        # Expand the pred_proba column into separate columns for each class
-        print(df_val.columns)
-        df_val = expand_proba_column(df_val, label_encoder.classes_)
-        print(df_val.columns)
-
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(y_validate)
-        preds = label_encoder.inverse_transform(preds)
-
-    # Save predictions to S3 (just the target, prediction, and '_proba' columns)
-    df_val["prediction"] = preds
-    output_columns = [target, "prediction"]
-    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-    wr.s3.to_csv(
-        df_val[output_columns],
-        path=f"{model_metrics_s3_path}/validation_predictions.csv",
-        index=False,
-    )
+        all_df[target] = label_encoder.fit_transform(all_df[target])
+        print(f"Class labels: {label_encoder.classes_.tolist()}")
 
-    # Report Performance Metrics
-    if model_type == "classifier":
-        # Get the label names and their integer mapping
-        label_names = label_encoder.classes_
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
 
-        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
-
-        # Put the scores into a dataframe
-        score_df = pd.DataFrame(
-            {
-                target: label_names,
-                "precision": scores[0],
-                "recall": scores[1],
-                "f1": scores[2],
-                "support": scores[3],
-            }
-        )
-
-        # We need to get creative with the Classification Metrics
-        metrics = ["precision", "recall", "f1", "support"]
-        for t in label_names:
-            for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
-                print(f"Metrics:{t}:{m} {value}")
-
-        # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
-        for i, row_name in enumerate(label_names):
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    # Map 'seed' to 'random_state' for XGBoost
+    if "seed" in xgb_params:
+        xgb_params["random_state"] = xgb_params.pop("seed")
 
+    # Handle objective: filter regression-only params for classifiers, set default for regressors
+    if model_type == "classifier":
+        xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
     else:
-        # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds)
-        mae = mean_absolute_error(y_validate, preds)
-        medae = median_absolute_error(y_validate, preds)
-        r2 = r2_score(y_validate, preds)
-        spearman_corr = spearmanr(y_validate, preds).correlation
-        support = len(df_val)
-        print(f"rmse: {rmse:.3f}")
-        print(f"mae: {mae:.3f}")
-        print(f"medae: {medae:.3f}")
-        print(f"r2: {r2:.3f}")
-        print(f"spearmanr: {spearman_corr:.3f}")
-        print(f"support: {support}")
-
-    # Now save the model to the standard place/name
-    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
-
-    # Save the label encoder if we have one
-    if label_encoder:
-        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
-
-    # Save the features (this will validate input during predictions)
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
-
-    # Save the category mappings
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+        # Default to MAE (reg:absoluteerror) for regression if not specified
+        xgb_params.setdefault("objective", "reg:absoluteerror")
 
+    print(f"XGBoost params: {xgb_params}")
 
-def model_fn(model_dir):
-    """Deserialize and return fitted XGBoost model"""
-    model_path = os.path.join(model_dir, "xgb_model.joblib")
-    model = joblib.load(model_path)
-    return model
-
-
-def input_fn(input_data, content_type):
-    """Parse input data and return a DataFrame."""
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
+    if n_folds == 1:
+        # Single train/val split
+        if "training" in all_df.columns:
+            print("Using 'training' column for train/val split")
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
+        else:
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-fold cross-validation
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df, all_df[target]))
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            folds = list(kfold.split(all_df))
 
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
 
-    if "text/csv" in content_type:
-        return pd.read_csv(StringIO(input_data))
-    elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    # Initialize out-of-fold storage
+    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
+    if model_type == "classifier":
+        num_classes = len(label_encoder.classes_)
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
     else:
-        raise ValueError(f"{content_type} not supported!")
-
+        oof_proba = None
+
+    # Check for sample weights
+    has_sample_weights = "sample_weight" in all_df.columns
+    if has_sample_weights:
+        sw = all_df["sample_weight"]
+        print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
+
+    # Train ensemble
+    ensemble_models = []
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
+        print(f"{'='*50}")
+
+        # Prepare fold data
+        X_train = all_df.iloc[train_idx][features]
+        y_train = all_df.iloc[train_idx][target]
+        X_val = all_df.iloc[val_idx][features]
+        sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
+
+        # Create model with fold-specific random state for diversity
+        fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
+        if model_type == "classifier":
+            model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
+        else:
+            model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
+
+        # Train
+        model.fit(X_train, y_train, sample_weight=sample_weights)
+        ensemble_models.append(model)
+
+        # Out-of-fold predictions
+        oof_predictions[val_idx] = model.predict(X_val)
+        if model_type == "classifier":
+            oof_proba[val_idx] = model.predict_proba(X_val)
+
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
+
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
+    if n_folds == 1:
+        # Single fold: only validation rows
+        val_mask = ~np.isnan(oof_predictions)
+        df_val = all_df[val_mask].copy()
+        predictions = oof_predictions[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+    else:
+        # K-fold: all rows have out-of-fold predictions
+        df_val = all_df.copy()
+        predictions = oof_predictions
 
-def output_fn(output_df, accept_type):
-    """Supports both CSV and JSON output formats."""
-    if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-        return csv_output, "text/csv"
-    elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+    # Decode labels for classification
+    if model_type == "classifier":
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
+        if oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
+            df_val = expand_proba_column(df_val, label_encoder.classes_)
     else:
-        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+        df_val["prediction"] = predictions
 
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values
 
-def predict_fn(df, model) -> pd.DataFrame:
-    """Make Predictions with our XGB Model
+    if model_type == "classifier":
+        label_names = label_encoder.classes_
+        score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
+        print_classification_metrics(score_df, target, label_names)
+        print_confusion_matrix(y_true, y_pred, label_names)
+    else:
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
+            df_val["prediction_std"] = np.std(all_preds, axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
+
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
 
-    Args:
-        df (pd.DataFrame): The input DataFrame
-        model: The model use for predictions
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    output_columns += [target, "prediction"]
 
-    Returns:
-        pd.DataFrame: The DataFrame with the predictions added
-    """
-    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    if model_type != "classifier":
+        output_columns.append("prediction_std")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
 
-    # Grab our feature columns (from training)
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]
 
-    # Load the category mappings (from training)
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
 
-    # Load our Label Encoder if we have one
-    label_encoder = None
-    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    for idx, m in enumerate(ensemble_models):
+        joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
+    print(f"Saved {len(ensemble_models)} model(s)")
 
-    # We're going match features in a case-insensitive manner, accounting for all the permutations
-    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-    matched_df = match_features_case_insensitive(df, features)
+    with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
+        json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)
 
-    # Detect categorical types in the incoming DataFrame
-    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)
 
-    # If we have compressed features, decompress them
-    if compressed_features:
-        print("Decompressing features for prediction...")
-        matched_df, features = decompress_features(matched_df, features, compressed_features)
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)
 
-    # Predict the features against our XGB Model
-    X = matched_df[features]
-    predictions = model.predict(X)
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)
 
-    # If we have a label encoder, decode the predictions
     if label_encoder:
-        predictions = label_encoder.inverse_transform(predictions)
-
-    # Set the predictions on the DataFrame
-    df["prediction"] = predictions
-
-    # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
-    if getattr(model, "predict_proba", None):
-        probs = model.predict_proba(matched_df[features])
-        df["pred_proba"] = [p.tolist() for p in probs]
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
-        # Expand the pred_proba column into separate columns for each class
-        df = expand_proba_column(df, label_encoder.classes_)
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)
 
-    # All done, return the DataFrame with new columns for the predictions
-    return df
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")