workbench 0.8.205-py3-none-any.whl → 0.8.212-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. workbench/algorithms/models/noise_model.py +388 -0
  2. workbench/api/endpoint.py +3 -6
  3. workbench/api/feature_set.py +1 -1
  4. workbench/api/model.py +5 -11
  5. workbench/cached/cached_model.py +4 -4
  6. workbench/core/artifacts/endpoint_core.py +57 -145
  7. workbench/core/artifacts/model_core.py +21 -19
  8. workbench/core/transforms/features_to_model/features_to_model.py +2 -2
  9. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
  10. workbench/model_script_utils/model_script_utils.py +335 -0
  11. workbench/model_script_utils/pytorch_utils.py +395 -0
  12. workbench/model_script_utils/uq_harness.py +278 -0
  13. workbench/model_scripts/chemprop/chemprop.template +289 -666
  14. workbench/model_scripts/chemprop/generated_model_script.py +292 -669
  15. workbench/model_scripts/chemprop/model_script_utils.py +335 -0
  16. workbench/model_scripts/chemprop/requirements.txt +2 -10
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
  18. workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
  19. workbench/model_scripts/pytorch_model/pytorch.template +350 -607
  20. workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
  21. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  22. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  23. workbench/model_scripts/script_generation.py +2 -5
  24. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  25. workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
  26. workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  28. workbench/model_scripts/xgb_model/xgb_model.template +344 -407
  29. workbench/scripts/training_test.py +85 -0
  30. workbench/utils/chemprop_utils.py +18 -656
  31. workbench/utils/metrics_utils.py +172 -0
  32. workbench/utils/model_utils.py +104 -47
  33. workbench/utils/pytorch_utils.py +32 -472
  34. workbench/utils/xgboost_local_crossfold.py +267 -0
  35. workbench/utils/xgboost_model_utils.py +49 -356
  36. workbench/web_interface/components/plugins/model_details.py +30 -68
  37. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/METADATA +5 -5
  38. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/RECORD +42 -31
  39. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/entry_points.txt +1 -0
  40. workbench/model_scripts/uq_models/mapie.template +0 -605
  41. workbench/model_scripts/uq_models/requirements.txt +0 -1
  42. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/WHEEL +0 -0
  43. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/licenses/LICENSE +0 -0
  44. {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/top_level.txt +0 -0
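The bulk of this release is a rewrite of the model script templates: the pytorch-tabular dependency is dropped in favor of a hand-rolled TabularMLP plus shared helper modules (model_script_utils, pytorch_utils, uq_harness), and the MAPIE-based uq_models template is removed in favor of the shared uq_harness. The hunks below appear to come from workbench/model_scripts/pytorch_model/pytorch.template (the {{...}} template placeholders are unfilled). The training loop keeps the same K-fold pattern throughout: one model per fold, out-of-fold predictions for validation metrics, ensemble mean/std at inference. A minimal sketch of that pattern, with Ridge standing in for the per-fold network (illustrative only, not a Workbench API):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=42)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
ensemble_models = []
oof_predictions = np.full(len(y), np.nan)

for train_idx, val_idx in kfold.split(X):
    model = Ridge().fit(X[train_idx], y[train_idx])
    ensemble_models.append(model)
    # Each row is scored by the one model that never saw it (out-of-fold)
    oof_predictions[val_idx] = model.predict(X[val_idx])

# At inference, every fold model scores every row: the mean becomes
# "prediction" and the std becomes "prediction_std" (ensemble disagreement)
stacked = np.stack([m.predict(X) for m in ensemble_models], axis=0)
prediction = stacked.mean(axis=0)
prediction_std = stacked.std(axis=0)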
@@ -1,39 +1,76 @@
-# Imports for PyTorch Tabular Model
+# PyTorch Tabular Model Template for Workbench
+#
+# This template handles both classification and regression models with:
+# - K-fold cross-validation ensemble training (or single train/val split)
+# - Out-of-fold predictions for validation metrics
+# - Categorical feature embedding via TabularMLP
+# - Compressed feature decompression
+
+import argparse
+import json
 import os
+
 import awswrangler as wr
+import joblib
 import numpy as np
-
-# PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
-# Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
-os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
-from pytorch_tabular import TabularModel
-from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
-from pytorch_tabular.models import CategoryEmbeddingModelConfig
-
-# Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    median_absolute_error,
-    r2_score,
-    root_mean_squared_error,
-    precision_recall_fscore_support,
-    confusion_matrix,
-)
-from scipy.stats import spearmanr
-
-# Classification Encoder
+import pandas as pd
+import torch
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
 from sklearn.preprocessing import LabelEncoder
 
-# Scikit Learn Imports
-from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
+# Enable Tensor Core optimization for GPUs that support it
+torch.set_float32_matmul_precision("medium")
+
+from model_script_utils import (
+    check_dataframe,
+    compute_classification_metrics,
+    compute_regression_metrics,
+    convert_categorical_types,
+    decompress_features,
+    expand_proba_column,
+    input_fn,
+    match_features_case_insensitive,
+    output_fn,
+    print_classification_metrics,
+    print_confusion_matrix,
+    print_regression_metrics,
+)
+from pytorch_utils import (
+    FeatureScaler,
+    create_model,
+    load_model,
+    predict,
+    prepare_data,
+    save_model,
+    train_model,
+)
+from uq_harness import (
+    compute_confidence,
+    load_uq_models,
+    predict_intervals,
+    save_uq_models,
+    train_uq_models,
+)
 
-from io import StringIO
-import json
-import argparse
-import joblib
-import pandas as pd
+# =============================================================================
+# Default Hyperparameters
+# =============================================================================
+DEFAULT_HYPERPARAMETERS = {
+    # Training parameters
+    "n_folds": 5,
+    "max_epochs": 200,
+    "early_stopping_patience": 20,
+    "batch_size": 128,
+    # Model architecture
+    "layers": "256-128-64",
+    "learning_rate": 1e-3,
+    "dropout": 0.1,
+    "use_batch_norm": True,
+    # Random seed
+    "seed": 42,
+}
 
-# Template Parameters
+# Template parameters (filled in by Workbench)
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target": "{{target_column}}",
@@ -45,373 +82,167 @@ TEMPLATE_PARAMS = {
 }
 
 
-def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-    """
-    Check if the provided dataframe is empty and raise an exception if it is.
-
-    Args:
-        df (pd.DataFrame): DataFrame to check
-        df_name (str): Name of the DataFrame
-    """
-    if df.empty:
-        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-        print(msg)
-        raise ValueError(msg)
-
-
-def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
-    """
-    Expands a column in a DataFrame containing a list of probabilities into separate columns.
-
-    Args:
-        df (pd.DataFrame): DataFrame containing a "pred_proba" column
-        class_labels (list[str]): List of class labels
-
-    Returns:
-        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
-    """
-    proba_column = "pred_proba"
-    if proba_column not in df.columns:
-        raise ValueError('DataFrame does not contain a "pred_proba" column')
-
-    # Construct new column names with '_proba' suffix
-    proba_splits = [f"{label}_proba" for label in class_labels]
-
-    # Expand the proba_column into separate columns for each probability
-    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
-
-    # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
-    df = df.reset_index(drop=True)
-
-    # Concatenate the new columns with the original DataFrame
-    df = pd.concat([df, proba_df], axis=1)
-    return df
-
-
-def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
-    """
-    Matches and renames DataFrame columns to match model feature names (case-insensitive).
-    Prioritizes exact matches, then case-insensitive matches.
-
-    Raises ValueError if any model features cannot be matched.
-    """
-    df_columns_lower = {col.lower(): col for col in df.columns}
-    rename_dict = {}
-    missing = []
-    for feature in model_features:
-        if feature in df.columns:
-            continue  # Exact match
-        elif feature.lower() in df_columns_lower:
-            rename_dict[df_columns_lower[feature.lower()]] = feature
-        else:
-            missing.append(feature)
-
-    if missing:
-        raise ValueError(f"Features not found: {missing}")
-
-    # Rename the DataFrame columns to match the model features
-    return df.rename(columns=rename_dict)
-
-
-def convert_categorical_types(
-    df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
-) -> tuple[pd.DataFrame, dict[str, list[str]]]:
-    """
-    Converts appropriate columns to categorical type with consistent mappings.
-
-    Args:
-        df (pd.DataFrame): The DataFrame to process.
-        features (list): List of feature names to consider for conversion.
-        category_mappings (dict, optional): Existing category mappings. If None or empty,
-                                            we're in training mode. If populated, we're in
-                                            inference mode.
-
-    Returns:
-        tuple: (processed DataFrame, category mappings dictionary)
-    """
-    if category_mappings is None:
-        category_mappings = {}
-
-    # Training mode
-    if not category_mappings:
-        for col in df.select_dtypes(include=["object", "string"]):
-            if col in features and df[col].nunique() < 20:
-                print(f"Training mode: Converting {col} to category")
-                df[col] = df[col].astype("category")
-                category_mappings[col] = df[col].cat.categories.tolist()
-
-    # Inference mode
-    else:
-        for col, categories in category_mappings.items():
-            if col in df.columns:
-                print(f"Inference mode: Applying categorical mapping for {col}")
-                df[col] = pd.Categorical(df[col], categories=categories)
-
-    return df, category_mappings
-
-
-def decompress_features(
-    df: pd.DataFrame, features: list[str], compressed_features: list[str]
-) -> tuple[pd.DataFrame, list[str]]:
-    """Prepare features for the model
-
-    Args:
-        df (pd.DataFrame): The features DataFrame
-        features (list[str]): Full list of feature names
-        compressed_features (list[str]): List of feature names to decompress (bitstrings)
-
-    Returns:
-        pd.DataFrame: DataFrame with the decompressed features
-        list[str]: Updated list of feature names after decompression
-
-    Raises:
-        ValueError: If any missing values are found in the specified features
-    """
-    # Check for any missing values in the required features
-    missing_counts = df[features].isna().sum()
-    if missing_counts.any():
-        missing_features = missing_counts[missing_counts > 0]
-        print(
-            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-            "WARNING: You might want to remove/replace all NaN values before processing."
-        )
-
-    # Make a copy to avoid mutating the original list
-    decompressed_features = features.copy()
-
-    for feature in compressed_features:
-        if (feature not in df.columns) or (feature not in decompressed_features):
-            print(f"Feature '{feature}' not in the features list, skipping decompression.")
-            continue
-
-        # Remove the feature from the list of features to avoid duplication
-        decompressed_features.remove(feature)
-
-        # Handle all compressed features as bitstrings
-        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-        prefix = feature[:3]
-
-        # Create all new columns at once - avoids fragmentation
-        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-        # Add to features list
-        decompressed_features.extend(new_col_names)
-
-        # Drop original column and concatenate new ones
-        df = df.drop(columns=[feature])
-        df = pd.concat([df, new_df], axis=1)
-
-    return df, decompressed_features
-
-
+# =============================================================================
+# Model Loading (for SageMaker inference)
+# =============================================================================
 def model_fn(model_dir: str) -> dict:
-    """Load the PyTorch Tabular ensemble models from the specified directory.
-
-    Args:
-        model_dir: Directory containing the saved model(s)
-
-    Returns:
-        Dictionary with ensemble models and metadata
-    """
-    import torch
-    from functools import partial
-
-    # Load ensemble metadata if present
-    ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
-    if os.path.exists(ensemble_metadata_path):
-        ensemble_metadata = joblib.load(ensemble_metadata_path)
-        n_ensemble = ensemble_metadata["n_ensemble"]
+    """Load TabularMLP ensemble from the specified directory."""
+    # Load ensemble metadata
+    metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
+    if os.path.exists(metadata_path):
+        metadata = joblib.load(metadata_path)
+        n_ensemble = metadata["n_ensemble"]
     else:
         n_ensemble = 1
 
-    # Determine map_location for loading models (handle CUDA trained models on CPU inference)
-    map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
-    # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
-    # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
-    # which internally calls torch.load without map_location
-    original_torch_load = torch.load
-    torch.load = partial(original_torch_load, map_location=map_location)
-
-    # Save current working directory
-    original_cwd = os.getcwd()
+    # Load ensemble models
     ensemble_models = []
-
-    try:
-        # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir("/tmp")
-
-        for ens_idx in range(n_ensemble):
-            # Try numbered model path first, fall back to legacy path
-            model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
-            if not os.path.exists(model_path):
-                model_path = os.path.join(model_dir, "tabular_model")
-            model = TabularModel.load_model(model_path, map_location=map_location)
-            ensemble_models.append(model)
-
-    finally:
-        # Restore torch.load and working directory
-        torch.load = original_torch_load
-        os.chdir(original_cwd)
-
-    return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
-
-
-def input_fn(input_data, content_type: str) -> pd.DataFrame:
-    """Parse input data and return a DataFrame."""
-    if not input_data:
-        raise ValueError("Empty input data is not supported!")
-
-    # Decode bytes to string if necessary
-    if isinstance(input_data, bytes):
-        input_data = input_data.decode("utf-8")
-
-    if "text/csv" in content_type:
-        return pd.read_csv(StringIO(input_data))
-    elif "application/json" in content_type:
-        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-    else:
-        raise ValueError(f"{content_type} not supported!")
-
-
-def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
-    """Supports both CSV and JSON output formats."""
-    if "text/csv" in accept_type:
-        csv_output = output_df.fillna("N/A").to_csv(index=False)
-        return csv_output, "text/csv"
-    elif "application/json" in accept_type:
-        return output_df.to_json(orient="records"), "application/json"
-    else:
-        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+    for i in range(n_ensemble):
+        model_path = os.path.join(model_dir, f"model_{i}")
+        model = load_model(model_path, device=device)
+        ensemble_models.append(model)
+
+    print(f"Loaded {len(ensemble_models)} model(s)")
+
+    # Load feature scaler
+    scaler = FeatureScaler.load(os.path.join(model_dir, "scaler.joblib"))
+
+    # Load UQ models (regression only)
+    uq_models, uq_metadata = None, None
+    uq_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_path):
+        uq_models, uq_metadata = load_uq_models(model_dir)
+
+    return {
+        "ensemble_models": ensemble_models,
+        "n_ensemble": n_ensemble,
+        "scaler": scaler,
+        "uq_models": uq_models,
+        "uq_metadata": uq_metadata,
+    }
 
 
+# =============================================================================
+# Inference (for SageMaker inference)
+# =============================================================================
 def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
-    """Make Predictions with our PyTorch Tabular Model ensemble.
-
-    Args:
-        df (pd.DataFrame): The input DataFrame
-        model_dict: Dictionary containing ensemble models and metadata
-
-    Returns:
-        pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
-    """
+    """Make predictions with TabularMLP ensemble."""
     model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
 
-    # Extract ensemble models
+    # Load artifacts
     ensemble_models = model_dict["ensemble_models"]
-    n_ensemble = model_dict["n_ensemble"]
+    scaler = model_dict["scaler"]
+    uq_models = model_dict.get("uq_models")
+    uq_metadata = model_dict.get("uq_metadata")
 
-    # Grab our feature columns (from training)
-    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-        features = json.load(fp)
-    print(f"Model Features: {features}")
+    with open(os.path.join(model_dir, "feature_columns.json")) as f:
+        features = json.load(f)
+    with open(os.path.join(model_dir, "category_mappings.json")) as f:
+        category_mappings = json.load(f)
+    with open(os.path.join(model_dir, "feature_metadata.json")) as f:
+        feature_metadata = json.load(f)
 
-    # Load the category mappings (from training)
-    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-        category_mappings = json.load(fp)
+    continuous_cols = feature_metadata["continuous_cols"]
+    categorical_cols = feature_metadata["categorical_cols"]
 
-    # Load our Label Encoder if we have one
     label_encoder = None
-    label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
-    if os.path.exists(label_encoder_path):
-        label_encoder = joblib.load(label_encoder_path)
+    encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+    if os.path.exists(encoder_path):
+        label_encoder = joblib.load(encoder_path)
 
-    # Match features in a case-insensitive manner
-    matched_df = match_features_case_insensitive(df, features)
+    print(f"Model Features: {features}")
 
-    # Detect categorical types in the incoming DataFrame
+    # Prepare features
+    matched_df = match_features_case_insensitive(df, features)
     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
 
-    # If we have compressed features, decompress them
     if compressed_features:
         print("Decompressing features for prediction...")
         matched_df, features = decompress_features(matched_df, features, compressed_features)
 
-    # Track rows with missing features
+    # Track missing features
    missing_mask = matched_df[features].isna().any(axis=1)
    if missing_mask.any():
-        print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
+        print(f"Warning: {missing_mask.sum()} rows have missing features")
 
-    # Initialize prediction columns
+    # Initialize output columns
     df["prediction"] = np.nan
     if model_type in ["regressor", "uq_regressor"]:
         df["prediction_std"] = np.nan
 
-    # Only predict on complete rows
-    complete_df = matched_df[~missing_mask]
+    complete_df = matched_df[~missing_mask].copy()
     if len(complete_df) == 0:
         print("Warning: No complete rows to predict on")
         return df
 
-    # pytorch-tabular returns predictions using f"{target}_prediction" column
-    target = TEMPLATE_PARAMS["target"]
-    prediction_column = f"{target}_prediction"
-
-    # Collect predictions from all ensemble members
-    all_ensemble_preds = []
-    all_ensemble_probs = []
-
-    for ens_idx, ens_model in enumerate(ensemble_models):
-        result = ens_model.predict(complete_df[features])
-
-        if prediction_column in result.columns:
-            ens_preds = result[prediction_column].values
-        else:
-            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-
-        all_ensemble_preds.append(ens_preds)
+    # Prepare data for inference (with standardization)
+    x_cont, x_cat, _, _, _ = prepare_data(
+        complete_df, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+    )
 
-        # For classification, collect probabilities
-        if label_encoder is not None:
-            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
-            if prob_cols:
-                all_ensemble_probs.append(result[prob_cols].values)
+    # Collect ensemble predictions
+    all_preds = []
+    for model in ensemble_models:
+        preds = predict(model, x_cont, x_cat)
+        all_preds.append(preds)
 
-    # Stack and compute mean/std (std is 0 for single model)
-    ensemble_preds = np.stack(all_ensemble_preds, axis=0)  # (n_ensemble, n_samples)
+    # Aggregate predictions
+    ensemble_preds = np.stack(all_preds, axis=0)
     preds = np.mean(ensemble_preds, axis=0)
-    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+    preds_std = np.std(ensemble_preds, axis=0)
 
-    print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
+    print(f"Inference complete: {len(preds)} predictions, {len(ensemble_models)} ensemble members")
 
-    # Handle classification vs regression
     if label_encoder is not None:
-        # For classification, average probabilities then take argmax
-        if all_ensemble_probs:
-            ensemble_probs = np.stack(all_ensemble_probs, axis=0)  # (n_ensemble, n_samples, n_classes)
-            avg_probs = np.mean(ensemble_probs, axis=0)  # (n_samples, n_classes)
-            class_preds = np.argmax(avg_probs, axis=1)
-            predictions = label_encoder.inverse_transform(class_preds)
-
-            # Build full proba Series with None for missing rows
-            all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
-            all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
-            df["pred_proba"] = all_proba
-
-            # Expand the pred_proba column into separate columns for each class
-            df = expand_proba_column(df, label_encoder.classes_)
-        else:
-            # No probabilities, use averaged predictions
-            predictions = label_encoder.inverse_transform(preds.astype(int))
+        # Classification: average probabilities, then argmax
+        avg_probs = preds  # Already softmax output
+        class_preds = np.argmax(avg_probs, axis=1)
+        predictions = label_encoder.inverse_transform(class_preds)
+
+        all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+        all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
+        df["pred_proba"] = all_proba
+        df = expand_proba_column(df, label_encoder.classes_)
     else:
-        # Regression (includes uq_regressor)
-        predictions = preds
-        df.loc[~missing_mask, "prediction_std"] = preds_std
+        # Regression
+        predictions = preds.flatten()
+        df.loc[~missing_mask, "prediction_std"] = preds_std.flatten()
+
+        # Add UQ intervals if available
+        if uq_models and uq_metadata:
+            X_complete = complete_df[features]
+            df_complete = df.loc[~missing_mask].copy()
+            df_complete["prediction"] = predictions  # Set prediction before compute_confidence
+            df_complete = predict_intervals(df_complete, X_complete, uq_models, uq_metadata)
+            df_complete = compute_confidence(df_complete, uq_metadata["median_interval_width"], "q_10", "q_90")
+            # Copy UQ columns back to main dataframe
+            for col in df_complete.columns:
+                if col.startswith("q_") or col == "confidence":
+                    df.loc[~missing_mask, col] = df_complete[col].values
 
-    # Set predictions only for complete rows
     df.loc[~missing_mask, "prediction"] = predictions
-
     return df
 
 
+# =============================================================================
+# Training
+# =============================================================================
 if __name__ == "__main__":
-    """The main function is for training the PyTorch Tabular model"""
+    # -------------------------------------------------------------------------
+    # Setup: Parse arguments and load data
+    # -------------------------------------------------------------------------
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+    args = parser.parse_args()
 
-    # Harness Template Parameters
+    # Extract template parameters
     target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
@@ -419,341 +250,253 @@ if __name__ == "__main__":
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-
-    # Script arguments for input/output directories
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-    parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-    )
-    args = parser.parse_args()
+    hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
 
-    # Read the training data into DataFrames
-    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+    # Load training data
+    training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
     print(f"Training Files: {training_files}")
-
-    # Combine files and read them all into a single pandas dataframe
-    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-    # Print out some info about the dataframe
-    print(f"All Data Shape: {all_df.shape}")
-    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
-    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
-
-    # Check if the dataframe is empty
+    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
     check_dataframe(all_df, "training_df")
 
-    # Drop any rows with missing feature values
-    initial_row_count = all_df.shape[0]
+    # Drop rows with missing features
+    initial_count = len(all_df)
     all_df = all_df.dropna(subset=features)
-    dropped_rows = initial_row_count - all_df.shape[0]
-    if dropped_rows > 0:
-        print(f"Dropped {dropped_rows} rows due to missing feature values.")
+    if len(all_df) < initial_count:
+        print(f"Dropped {initial_count - len(all_df)} rows with missing features")
 
-    # Features/Target output
     print(f"Target: {target}")
-    print(f"Features: {str(features)}")
+    print(f"Features: {features}")
+    print(f"Hyperparameters: {hyperparameters}")
 
-    # Convert any features that might be categorical to 'category' type
+    # -------------------------------------------------------------------------
+    # Preprocessing
+    # -------------------------------------------------------------------------
     all_df, category_mappings = convert_categorical_types(all_df, features)
 
-    # Print out some info about the dataframe
-    print(f"All Data Shape: {all_df.shape}")
-    print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
-    print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
-
-    # If we have compressed features, decompress them
     if compressed_features:
-        print(f"Decompressing features {compressed_features}...")
+        print(f"Decompressing features: {compressed_features}")
         all_df, features = decompress_features(all_df, features, compressed_features)
 
-    # Determine categorical and continuous columns
-    categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
-    continuous_cols = [col for col in features if col not in categorical_cols]
-    print(f"Categorical columns: {categorical_cols}")
-    print(f"Continuous columns: {continuous_cols}")
-
-    # Cast continuous columns to float
+    # Determine categorical vs continuous columns
+    categorical_cols = [c for c in features if all_df[c].dtype.name == "category"]
+    continuous_cols = [c for c in features if c not in categorical_cols]
    all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
+    print(f"Categorical: {categorical_cols}")
+    print(f"Continuous: {len(continuous_cols)} columns")
 
-    # Choose the 'task' based on model type and set up the label encoder if needed
+    # -------------------------------------------------------------------------
+    # Classification setup
+    # -------------------------------------------------------------------------
+    label_encoder = None
+    n_outputs = 1
    if model_type == "classifier":
-        task = "classification"
-        # Encode the target column on full dataset for consistent encoding
        label_encoder = LabelEncoder()
        all_df[target] = label_encoder.fit_transform(all_df[target])
-        num_classes = len(label_encoder.classes_)
-    else:
-        task = "regression"
-        label_encoder = None
-        num_classes = None
+        n_outputs = len(label_encoder.classes_)
+        print(f"Class labels: {label_encoder.classes_.tolist()}")
 
-    # Use any hyperparameters to set up both the trainer and model configurations
-    print(f"Hyperparameters: {hyperparameters}")
-    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    # -------------------------------------------------------------------------
+    # Cross-validation setup
+    # -------------------------------------------------------------------------
+    n_folds = hyperparameters["n_folds"]
+    task = "classification" if model_type == "classifier" else "regression"
+    hidden_layers = [int(x) for x in hyperparameters["layers"].split("-")]
 
-    # =========================================================================
-    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
-    # =========================================================================
-    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+    # Get categorical cardinalities
+    categorical_cardinalities = [len(category_mappings.get(col, {})) for col in categorical_cols]
 
-    # Create fold splits
    if n_folds == 1:
-        # Single fold: use train/val split from "training" column or random split
        if "training" in all_df.columns:
-            print("Found training column, splitting data based on training column")
+            print("Using 'training' column for train/val split")
            train_idx = np.where(all_df["training"])[0]
            val_idx = np.where(~all_df["training"])[0]
        else:
-            print("WARNING: No training column found, splitting data with random 80/20 split")
-            indices = np.arange(len(all_df))
-            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+            print("WARNING: No 'training' column found, using random 80/20 split")
+            train_idx, val_idx = train_test_split(np.arange(len(all_df)), test_size=0.2, random_state=42)
        folds = [(train_idx, val_idx)]
    else:
-        # K-Fold CV
        if model_type == "classifier":
            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = all_df[target]
+            folds = list(kfold.split(all_df, all_df[target]))
        else:
            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = None
-        folds = list(kfold.split(all_df, split_target))
-
-    # Initialize storage for out-of-fold predictions
-    oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
-    if model_type == "classifier" and num_classes and num_classes > 1:
-        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
-    else:
-        oof_proba = None
+            folds = list(kfold.split(all_df))
 
-    ensemble_models = []
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
 
-    # Set up PyTorch Tabular data configuration (shared across folds)
-    data_config = DataConfig(
-        target=[target],
-        continuous_cols=continuous_cols,
-        categorical_cols=categorical_cols,
-    )
+    # Fit scaler on all training data (used across all folds)
+    scaler = FeatureScaler()
+    scaler.fit(all_df, continuous_cols)
+    print(f"Fitted scaler on {len(continuous_cols)} continuous features")
 
-    # Model config defaults
-    model_defaults = {
-        "layers": "256-128-64",
-        "activation": "LeakyReLU",
-        "learning_rate": 1e-3,
-        "dropout": 0.1,
-        "use_batch_norm": True,
-        "initialization": "kaiming",
-    }
-    # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
-    for key, value in model_overrides.items():
-        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
-    model_params = {**model_defaults, **model_overrides}
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
 
-    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
-    optimizer_config = OptimizerConfig()
+    # -------------------------------------------------------------------------
+    # Training loop
+    # -------------------------------------------------------------------------
+    oof_predictions = np.full((len(all_df), n_outputs), np.nan, dtype=np.float64)
 
+    ensemble_models = []
    for fold_idx, (train_idx, val_idx) in enumerate(folds):
        print(f"\n{'='*50}")
-        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
        print(f"{'='*50}")
 
-        # Split data for this fold
        df_train = all_df.iloc[train_idx].reset_index(drop=True)
        df_val = all_df.iloc[val_idx].reset_index(drop=True)
 
-        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
-
-        # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
-        # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
-        batch_size = min(128, max(32, len(df_train) // 16))
-        if len(df_train) % batch_size == 1:
-            batch_size += 1  # Adjust to avoid last batch of size 1
-        trainer_defaults = {
-            "auto_lr_find": False,
-            "batch_size": batch_size,
-            "max_epochs": 200,
-            "min_epochs": 10,
-            "early_stopping": "valid_loss",
-            "early_stopping_patience": 20,
-            "checkpoints": "valid_loss",
-            "accelerator": "auto",
-            "progress_bar": "none",
-            "gradient_clip_val": 1.0,
-            "seed": 42 + fold_idx,
-        }
-
-        # Override defaults with training_config if present
-        training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
-        if fold_idx == 0:  # Only print overrides once
-            for key, value in training_overrides.items():
-                print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
-        trainer_params = {**trainer_defaults, **training_overrides}
-        trainer_config = TrainerConfig(**trainer_params)
-
-        # Create and train the TabularModel for this fold
-        tabular_model = TabularModel(
-            data_config=data_config,
-            model_config=model_config,
-            optimizer_config=optimizer_config,
-            trainer_config=trainer_config,
+        # Prepare data (using pre-fitted scaler)
+        train_x_cont, train_x_cat, train_y, _, _ = prepare_data(
+            df_train, continuous_cols, categorical_cols, target, category_mappings, scaler=scaler
+        )
+        val_x_cont, val_x_cat, val_y, _, _ = prepare_data(
+            df_val, continuous_cols, categorical_cols, target, category_mappings, scaler=scaler
        )
-        tabular_model.fit(train=df_train, validation=df_val)
-        ensemble_models.append(tabular_model)
 
-        # Make out-of-fold predictions
-        result = tabular_model.predict(df_val, include_input_features=False)
-        fold_preds = result[f"{target}_prediction"].values
+        # Create model
+        torch.manual_seed(hyperparameters["seed"] + fold_idx)
+        model = create_model(
+            n_continuous=len(continuous_cols),
+            categorical_cardinalities=categorical_cardinalities,
+            hidden_layers=hidden_layers,
+            n_outputs=n_outputs,
+            task=task,
+            dropout=hyperparameters["dropout"],
+            use_batch_norm=hyperparameters["use_batch_norm"],
+        )
 
-        # Store out-of-fold predictions
-        if model_type == "classifier":
-            oof_predictions[val_idx] = fold_preds.astype(int)
-            prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
-            if prob_cols and oof_proba is not None:
-                oof_proba[val_idx] = result[prob_cols].values
-        else:
-            oof_predictions[val_idx] = fold_preds.flatten()
+        # Train
+        model, history = train_model(
+            model,
+            train_x_cont, train_x_cat, train_y,
+            val_x_cont, val_x_cat, val_y,
+            task=task,
+            max_epochs=hyperparameters["max_epochs"],
+            patience=hyperparameters["early_stopping_patience"],
+            batch_size=hyperparameters["batch_size"],
+            learning_rate=hyperparameters["learning_rate"],
+            device=device,
+        )
+        ensemble_models.append(model)
 
-        print(f"Fold {fold_idx + 1} complete!")
+        # Out-of-fold predictions
+        fold_preds = predict(model, val_x_cont, val_x_cat)
+        oof_predictions[val_idx] = fold_preds
 
    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
 
-    # Use out-of-fold predictions for metrics
-    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    # -------------------------------------------------------------------------
+    # Prepare validation results
+    # -------------------------------------------------------------------------
    if n_folds == 1:
-        val_mask = ~np.isnan(oof_predictions)
-        preds = oof_predictions[val_mask]
+        val_mask = ~np.isnan(oof_predictions[:, 0])
        df_val = all_df[val_mask].copy()
-        if oof_proba is not None:
-            oof_proba = oof_proba[val_mask]
+        predictions = oof_predictions[val_mask]
    else:
-        preds = oof_predictions
        df_val = all_df.copy()
+        predictions = oof_predictions
 
-    # Compute prediction_std by running all ensemble models on validation data
-    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
-    preds_std = None
-    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
-        print("Computing prediction_std from ensemble predictions on validation data...")
-        all_ensemble_preds_for_std = []
-        for ens_model in ensemble_models:
-            result = ens_model.predict(df_val[features], include_input_features=False)
-            ens_preds = result[f"{target}_prediction"].values.flatten()
-            all_ensemble_preds_for_std.append(ens_preds)
-
-        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
-        preds_std = np.std(ensemble_preds_stacked, axis=0)
-        print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
+    # Decode labels for classification
+    if model_type == "classifier":
+        class_preds = np.argmax(predictions, axis=1)
+        df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
+        df_val["prediction"] = label_encoder.inverse_transform(class_preds)
+        df_val["pred_proba"] = [p.tolist() for p in predictions]
+        df_val = expand_proba_column(df_val, label_encoder.classes_)
+    else:
+        df_val["prediction"] = predictions.flatten()
+
+    # -------------------------------------------------------------------------
+    # Compute and print metrics
+    # -------------------------------------------------------------------------
+    y_true = df_val[target].values
+    y_pred = df_val["prediction"].values
 
    if model_type == "classifier":
-        # Get probabilities for classification
-        if oof_proba is not None:
-            df_val = df_val.copy()
-            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
-            df_val = expand_proba_column(df_val, label_encoder.classes_)
-
-        # Decode the target and prediction labels
-        y_validate = label_encoder.inverse_transform(df_val[target])
-        preds_decoded = label_encoder.inverse_transform(preds.astype(int))
+        score_df = compute_classification_metrics(y_true, y_pred, label_encoder.classes_, target)
+        print_classification_metrics(score_df, target, label_encoder.classes_)
+        print_confusion_matrix(y_true, y_pred, label_encoder.classes_)
    else:
-        y_validate = df_val[target].values
-        preds_decoded = preds
+        metrics = compute_regression_metrics(y_true, y_pred)
+        print_regression_metrics(metrics)
+
+        # Compute ensemble prediction_std
+        if n_folds > 1:
+            # Re-run inference with all models to get std
+            x_cont, x_cat, _, _, _ = prepare_data(
+                df_val, continuous_cols, categorical_cols, category_mappings=category_mappings, scaler=scaler
+            )
+            all_preds = [predict(m, x_cont, x_cat).flatten() for m in ensemble_models]
+            df_val["prediction_std"] = np.std(np.stack(all_preds), axis=0)
+            print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        else:
+            df_val["prediction_std"] = 0.0
 
-    # Save predictions to S3
-    df_val = df_val.copy()
-    df_val["prediction"] = preds_decoded
+        # Train UQ models for uncertainty quantification
+        print("\n" + "=" * 50)
+        print("Training UQ Models")
+        print("=" * 50)
+        uq_models, uq_metadata = train_uq_models(
+            all_df[features], all_df[target], df_val[features], y_true
+        )
+        df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
+        df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
 
-    # Build output columns - include id_column if it exists
+    # -------------------------------------------------------------------------
+    # Save validation predictions to S3
+    # -------------------------------------------------------------------------
    output_columns = []
    if id_column in df_val.columns:
        output_columns.append(id_column)
    output_columns += [target, "prediction"]
 
-    # Add prediction_std for regression models (always present, 0 for single model)
-    if model_type in ["regressor", "uq_regressor"]:
-        if preds_std is not None:
-            df_val["prediction_std"] = preds_std
-        else:
-            df_val["prediction_std"] = 0.0
+    if model_type != "classifier":
        output_columns.append("prediction_std")
-        print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
+        output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
+
+    output_columns += [c for c in df_val.columns if c.endswith("_proba")]
+
+    wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
+
+    # -------------------------------------------------------------------------
+    # Save model artifacts
+    # -------------------------------------------------------------------------
+    model_config = {
+        "n_continuous": len(continuous_cols),
+        "categorical_cardinalities": categorical_cardinalities,
+        "hidden_layers": hidden_layers,
+        "n_outputs": n_outputs,
+        "task": task,
+        "dropout": hyperparameters["dropout"],
+        "use_batch_norm": hyperparameters["use_batch_norm"],
+    }
 
-    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
-    wr.s3.to_csv(
-        df_val[output_columns],
-        path=f"{model_metrics_s3_path}/validation_predictions.csv",
-        index=False,
-    )
+    for idx, m in enumerate(ensemble_models):
+        save_model(m, os.path.join(args.model_dir, f"model_{idx}"), model_config)
+    print(f"Saved {len(ensemble_models)} model(s)")
 
-    # Report Performance Metrics
-    if model_type == "classifier":
-        # Get the label names and their integer mapping
-        label_names = label_encoder.classes_
-
-        # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
-
-        # Put the scores into a dataframe
-        score_df = pd.DataFrame(
-            {
-                target: label_names,
-                "precision": scores[0],
-                "recall": scores[1],
-                "f1": scores[2],
-                "support": scores[3],
-            }
-        )
+    joblib.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
 
-        # Output metrics per class
-        metrics = ["precision", "recall", "f1", "support"]
-        for t in label_names:
-            for m in metrics:
-                value = score_df.loc[score_df[target] == t, m].iloc[0]
-                print(f"Metrics:{t}:{m} {value}")
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
+        json.dump(orig_features, f)
 
-        # Compute and output the confusion matrix
-        conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
-        for i, row_name in enumerate(label_names):
-            for j, col_name in enumerate(label_names):
-                value = conf_mtx[i, j]
-                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
+        json.dump(category_mappings, f)
 
-    else:
-        # Calculate various model performance metrics (regression)
-        rmse = root_mean_squared_error(y_validate, preds_decoded)
-        mae = mean_absolute_error(y_validate, preds_decoded)
-        medae = median_absolute_error(y_validate, preds_decoded)
-        r2 = r2_score(y_validate, preds_decoded)
-        spearman_corr = spearmanr(y_validate, preds_decoded).correlation
-        support = len(df_val)
-        print(f"rmse: {rmse:.3f}")
-        print(f"mae: {mae:.3f}")
-        print(f"medae: {medae:.3f}")
-        print(f"r2: {r2:.3f}")
-        print(f"spearmanr: {spearman_corr:.3f}")
-        print(f"support: {support}")
-
-    # Save ensemble models
-    for model_idx, ens_model in enumerate(ensemble_models):
-        model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
-        ens_model.save_model(model_path)
-        print(f"Saved model {model_idx + 1} to {model_path}")
-
-    # Save ensemble metadata
-    n_ensemble = len(ensemble_models)
-    ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
-    joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
-    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+    with open(os.path.join(args.model_dir, "feature_metadata.json"), "w") as f:
+        json.dump({"continuous_cols": continuous_cols, "categorical_cols": categorical_cols}, f)
+
+    with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
+        json.dump(hyperparameters, f, indent=2)
+
+    scaler.save(os.path.join(args.model_dir, "scaler.joblib"))
 
    if label_encoder:
        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
 
-    # Save the features (this will validate input during predictions)
-    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(orig_features, fp)
+    if model_type != "classifier":
+        save_uq_models(uq_models, uq_metadata, args.model_dir)
 
-    # Save the category mappings
-    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-        json.dump(category_mappings, fp)
+    print(f"\nModel training complete! Artifacts saved to {args.model_dir}")