workbench-0.8.160-py3-none-any.whl → workbench-0.8.202-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (114)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +261 -235
  4. workbench/algorithms/graph/light/proximity_graph.py +10 -8
  5. workbench/api/__init__.py +2 -1
  6. workbench/api/compound.py +1 -1
  7. workbench/api/endpoint.py +11 -0
  8. workbench/api/feature_set.py +12 -8
  9. workbench/api/meta.py +5 -2
  10. workbench/api/model.py +16 -15
  11. workbench/api/monitor.py +1 -16
  12. workbench/api/parameter_store.py +5 -0
  13. workbench/core/artifacts/__init__.py +11 -2
  14. workbench/core/artifacts/artifact.py +11 -3
  15. workbench/core/artifacts/data_capture_core.py +355 -0
  16. workbench/core/artifacts/endpoint_core.py +256 -118
  17. workbench/core/artifacts/feature_set_core.py +265 -16
  18. workbench/core/artifacts/model_core.py +110 -63
  19. workbench/core/artifacts/monitor_core.py +33 -248
  20. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  21. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  22. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  23. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  24. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  25. workbench/core/transforms/features_to_model/features_to_model.py +45 -33
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  28. workbench/core/views/training_view.py +113 -42
  29. workbench/core/views/view.py +53 -3
  30. workbench/core/views/view_utils.py +4 -4
  31. workbench/model_scripts/chemprop/chemprop.template +852 -0
  32. workbench/model_scripts/chemprop/generated_model_script.py +852 -0
  33. workbench/model_scripts/chemprop/requirements.txt +11 -0
  34. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  35. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  36. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  37. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  38. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  39. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  40. workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
  41. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  42. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  43. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  44. workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
  45. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  46. workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
  47. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  48. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  49. workbench/model_scripts/pytorch_model/generated_model_script.py +390 -188
  50. workbench/model_scripts/pytorch_model/pytorch.template +387 -176
  51. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  52. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  53. workbench/model_scripts/script_generation.py +19 -10
  54. workbench/model_scripts/uq_models/generated_model_script.py +605 -0
  55. workbench/model_scripts/uq_models/mapie.template +605 -0
  56. workbench/model_scripts/uq_models/requirements.txt +1 -0
  57. workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
  58. workbench/model_scripts/xgb_model/xgb_model.template +44 -46
  59. workbench/repl/workbench_shell.py +28 -14
  60. workbench/scripts/endpoint_test.py +162 -0
  61. workbench/scripts/lambda_test.py +73 -0
  62. workbench/scripts/ml_pipeline_batch.py +137 -0
  63. workbench/scripts/ml_pipeline_sqs.py +186 -0
  64. workbench/scripts/monitor_cloud_watch.py +20 -100
  65. workbench/utils/aws_utils.py +4 -3
  66. workbench/utils/chem_utils/__init__.py +0 -0
  67. workbench/utils/chem_utils/fingerprints.py +134 -0
  68. workbench/utils/chem_utils/misc.py +194 -0
  69. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  70. workbench/utils/chem_utils/mol_standardize.py +450 -0
  71. workbench/utils/chem_utils/mol_tagging.py +348 -0
  72. workbench/utils/chem_utils/projections.py +209 -0
  73. workbench/utils/chem_utils/salts.py +256 -0
  74. workbench/utils/chem_utils/sdf.py +292 -0
  75. workbench/utils/chem_utils/toxicity.py +250 -0
  76. workbench/utils/chem_utils/vis.py +253 -0
  77. workbench/utils/chemprop_utils.py +760 -0
  78. workbench/utils/cloudwatch_handler.py +1 -1
  79. workbench/utils/cloudwatch_utils.py +137 -0
  80. workbench/utils/config_manager.py +3 -7
  81. workbench/utils/endpoint_utils.py +5 -7
  82. workbench/utils/license_manager.py +2 -6
  83. workbench/utils/model_utils.py +95 -34
  84. workbench/utils/monitor_utils.py +44 -62
  85. workbench/utils/pandas_utils.py +3 -3
  86. workbench/utils/pytorch_utils.py +526 -0
  87. workbench/utils/shap_utils.py +10 -2
  88. workbench/utils/workbench_logging.py +0 -3
  89. workbench/utils/workbench_sqs.py +1 -1
  90. workbench/utils/xgboost_model_utils.py +371 -156
  91. workbench/web_interface/components/model_plot.py +7 -1
  92. workbench/web_interface/components/plugin_unit_test.py +5 -2
  93. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  94. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  95. workbench/web_interface/components/plugins/model_details.py +9 -7
  96. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  97. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
  98. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/RECORD +102 -86
  99. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
  100. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
  101. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  102. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  103. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  104. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  105. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  106. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  107. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  108. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  109. workbench/utils/chem_utils.py +0 -1556
  110. workbench/utils/execution_environment.py +0 -211
  111. workbench/utils/fast_inference.py +0 -167
  112. workbench/utils/resource_utils.py +0 -39
  113. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
  114. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -13,38 +13,38 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
13
13
  # Model Performance Scores
14
14
  from sklearn.metrics import (
15
15
  mean_absolute_error,
16
+ median_absolute_error,
16
17
  r2_score,
17
18
  root_mean_squared_error,
18
19
  precision_recall_fscore_support,
19
20
  confusion_matrix,
20
21
  )
22
+ from scipy.stats import spearmanr
21
23
 
22
24
  # Classification Encoder
23
25
  from sklearn.preprocessing import LabelEncoder
24
26
 
25
27
  # Scikit Learn Imports
26
- from sklearn.model_selection import train_test_split
28
+ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
27
29
 
28
30
  from io import StringIO
29
31
  import json
30
32
  import argparse
31
33
  import joblib
32
- import os
33
34
  import pandas as pd
34
- from typing import List, Tuple
35
35
 
36
36
  # Template Parameters
37
37
  TEMPLATE_PARAMS = {
38
38
  "model_type": "{{model_type}}",
39
- "target_column": "{{target_column}}",
39
+ "target": "{{target_column}}",
40
40
  "features": "{{feature_list}}",
41
+ "id_column": "{{id_column}}",
41
42
  "compressed_features": "{{compressed_features}}",
42
43
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
43
- "train_all_data": "{{train_all_data}}"
44
+ "hyperparameters": "{{hyperparameters}}",
44
45
  }
45
46
 
46
47
 
47
- # Function to check if dataframe is empty
48
48
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
49
49
  """
50
50
  Check if the provided dataframe is empty and raise an exception if it is.
@@ -59,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
59
59
  raise ValueError(msg)
60
60
 
61
61
 
62
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
62
+ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
63
63
  """
64
64
  Expands a column in a DataFrame containing a list of probabilities into separate columns.
65
65
 
66
66
  Args:
67
67
  df (pd.DataFrame): DataFrame containing a "pred_proba" column
68
- class_labels (List[str]): List of class labels
68
+ class_labels (list[str]): List of class labels
69
69
 
70
70
  Returns:
71
71
  pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
72
72
  """
73
-
74
- # Sanity check
75
73
  proba_column = "pred_proba"
76
74
  if proba_column not in df.columns:
77
75
  raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -88,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
88
86
 
89
87
  # Concatenate the new columns with the original DataFrame
90
88
  df = pd.concat([df, proba_df], axis=1)
91
- print(df)
92
89
  return df
93
90
 
94
91
 
95
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
92
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
96
93
  """
97
94
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
98
95
  Prioritizes exact matches, then case-insensitive matches.
@@ -102,7 +99,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
102
99
  df_columns_lower = {col.lower(): col for col in df.columns}
103
100
  rename_dict = {}
104
101
  missing = []
105
-
106
102
  for feature in model_features:
107
103
  if feature in df.columns:
108
104
  continue # Exact match
@@ -114,58 +110,64 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
114
110
  if missing:
115
111
  raise ValueError(f"Features not found: {missing}")
116
112
 
113
+ # Rename the DataFrame columns to match the model features
117
114
  return df.rename(columns=rename_dict)
118
115
 
119
116
 
120
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
117
+ def convert_categorical_types(
118
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
119
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
121
120
  """
122
121
  Converts appropriate columns to categorical type with consistent mappings.
123
122
 
124
123
  Args:
125
124
  df (pd.DataFrame): The DataFrame to process.
126
125
  features (list): List of feature names to consider for conversion.
127
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
128
- training mode. If populated, we're in inference mode.
126
+ category_mappings (dict, optional): Existing category mappings. If None or empty,
127
+ we're in training mode. If populated, we're in
128
+ inference mode.
129
129
 
130
130
  Returns:
131
131
  tuple: (processed DataFrame, category mappings dictionary)
132
132
  """
133
+ if category_mappings is None:
134
+ category_mappings = {}
135
+
133
136
  # Training mode
134
- if category_mappings == {}:
137
+ if not category_mappings:
135
138
  for col in df.select_dtypes(include=["object", "string"]):
136
139
  if col in features and df[col].nunique() < 20:
137
140
  print(f"Training mode: Converting {col} to category")
138
141
  df[col] = df[col].astype("category")
139
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
142
+ category_mappings[col] = df[col].cat.categories.tolist()
140
143
 
141
144
  # Inference mode
142
145
  else:
143
146
  for col, categories in category_mappings.items():
144
147
  if col in df.columns:
145
148
  print(f"Inference mode: Applying categorical mapping for {col}")
146
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
149
+ df[col] = pd.Categorical(df[col], categories=categories)
147
150
 
148
151
  return df, category_mappings
149
152
 
150
153
 
151
154
  def decompress_features(
152
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
153
- ) -> Tuple[pd.DataFrame, List[str]]:
155
+ df: pd.DataFrame, features: list[str], compressed_features: list[str]
156
+ ) -> tuple[pd.DataFrame, list[str]]:
154
157
  """Prepare features for the model
155
158
 
156
159
  Args:
157
160
  df (pd.DataFrame): The features DataFrame
158
- features (List[str]): Full list of feature names
159
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
161
+ features (list[str]): Full list of feature names
162
+ compressed_features (list[str]): List of feature names to decompress (bitstrings)
160
163
 
161
164
  Returns:
162
165
  pd.DataFrame: DataFrame with the decompressed features
163
- List[str]: Updated list of feature names after decompression
166
+ list[str]: Updated list of feature names after decompression
164
167
 
165
168
  Raises:
166
169
  ValueError: If any missing values are found in the specified features
167
170
  """
168
-
169
171
  # Check for any missing values in the required features
170
172
  missing_counts = df[features].isna().sum()
171
173
  if missing_counts.any():
@@ -175,10 +177,11 @@ def decompress_features(
175
177
  "WARNING: You might want to remove/replace all NaN values before processing."
176
178
  )
177
179
 
178
- # Decompress the specified compressed features
179
- decompressed_features = features
180
+ # Make a copy to avoid mutating the original list
181
+ decompressed_features = features.copy()
182
+
180
183
  for feature in compressed_features:
181
- if (feature not in df.columns) or (feature not in features):
184
+ if (feature not in df.columns) or (feature not in decompressed_features):
182
185
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
183
186
  continue
184
187
 
@@ -203,26 +206,60 @@ def decompress_features(
203
206
  return df, decompressed_features
204
207
 
205
208
 
206
- def model_fn(model_dir):
209
+ def model_fn(model_dir: str) -> dict:
210
+ """Load the PyTorch Tabular ensemble models from the specified directory.
211
+
212
+ Args:
213
+ model_dir: Directory containing the saved model(s)
214
+
215
+ Returns:
216
+ Dictionary with ensemble models and metadata
217
+ """
218
+ import torch
219
+ from functools import partial
220
+
221
+ # Load ensemble metadata if present
222
+ ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
223
+ if os.path.exists(ensemble_metadata_path):
224
+ ensemble_metadata = joblib.load(ensemble_metadata_path)
225
+ n_ensemble = ensemble_metadata["n_ensemble"]
226
+ else:
227
+ n_ensemble = 1
228
+
229
+ # Determine map_location for loading models (handle CUDA trained models on CPU inference)
230
+ map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
231
+
232
+ # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
233
+ # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
234
+ # which internally calls torch.load without map_location
235
+ original_torch_load = torch.load
236
+ torch.load = partial(original_torch_load, map_location=map_location)
207
237
 
208
238
  # Save current working directory
209
239
  original_cwd = os.getcwd()
240
+ ensemble_models = []
241
+
210
242
  try:
211
243
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
212
- os.chdir('/tmp')
244
+ os.chdir("/tmp")
213
245
 
214
- # Load the model
215
- model_path = os.path.join(model_dir, "tabular_model")
216
- model = TabularModel.load_model(model_path)
246
+ for ens_idx in range(n_ensemble):
247
+ # Try numbered model path first, fall back to legacy path
248
+ model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
249
+ if not os.path.exists(model_path):
250
+ model_path = os.path.join(model_dir, "tabular_model")
251
+ model = TabularModel.load_model(model_path, map_location=map_location)
252
+ ensemble_models.append(model)
217
253
 
218
- # Restore the original working directory
219
254
  finally:
255
+ # Restore torch.load and working directory
256
+ torch.load = original_torch_load
220
257
  os.chdir(original_cwd)
221
258
 
222
- return model
259
+ return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
223
260
 
224
261
 
225
- def input_fn(input_data, content_type):
262
+ def input_fn(input_data, content_type: str) -> pd.DataFrame:
226
263
  """Parse input data and return a DataFrame."""
227
264
  if not input_data:
228
265
  raise ValueError("Empty input data is not supported!")
@@ -239,29 +276,34 @@ def input_fn(input_data, content_type):
239
276
  raise ValueError(f"{content_type} not supported!")
240
277
 
241
278
 
242
- def output_fn(output_df, accept_type):
279
+ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
243
280
  """Supports both CSV and JSON output formats."""
244
281
  if "text/csv" in accept_type:
245
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
282
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
246
283
  return csv_output, "text/csv"
247
284
  elif "application/json" in accept_type:
248
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
285
+ return output_df.to_json(orient="records"), "application/json"
249
286
  else:
250
287
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
251
288
 
252
289
 
253
- def predict_fn(df, model) -> pd.DataFrame:
254
- """Make Predictions with our PyTorch Tabular Model
290
+ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
291
+ """Make Predictions with our PyTorch Tabular Model ensemble.
255
292
 
256
293
  Args:
257
294
  df (pd.DataFrame): The input DataFrame
258
- model: The TabularModel use for predictions
295
+ model_dict: Dictionary containing ensemble models and metadata
259
296
 
260
297
  Returns:
261
- pd.DataFrame: The DataFrame with the predictions added
298
+ pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
262
299
  """
300
+ model_type = TEMPLATE_PARAMS["model_type"]
263
301
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
264
302
 
303
+ # Extract ensemble models
304
+ ensemble_models = model_dict["ensemble_models"]
305
+ n_ensemble = model_dict["n_ensemble"]
306
+
265
307
  # Grab our feature columns (from training)
266
308
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
267
309
  with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -274,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
274
316
 
275
317
  # Load our Label Encoder if we have one
276
318
  label_encoder = None
277
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
278
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
319
+ label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
320
+ if os.path.exists(label_encoder_path):
321
+ label_encoder = joblib.load(label_encoder_path)
279
322
 
280
- # We're going match features in a case-insensitive manner, accounting for all the permutations
281
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
282
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
323
+ # Match features in a case-insensitive manner
283
324
  matched_df = match_features_case_insensitive(df, features)
284
325
 
285
326
  # Detect categorical types in the incoming DataFrame
@@ -290,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
290
331
  print("Decompressing features for prediction...")
291
332
  matched_df, features = decompress_features(matched_df, features, compressed_features)
292
333
 
293
- # Make predictions using the TabularModel
294
- result = model.predict(matched_df[features])
334
+ # Track rows with missing features
335
+ missing_mask = matched_df[features].isna().any(axis=1)
336
+ if missing_mask.any():
337
+ print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
338
+
339
+ # Initialize prediction columns
340
+ df["prediction"] = np.nan
341
+ if model_type in ["regressor", "uq_regressor"]:
342
+ df["prediction_std"] = np.nan
343
+
344
+ # Only predict on complete rows
345
+ complete_df = matched_df[~missing_mask]
346
+ if len(complete_df) == 0:
347
+ print("Warning: No complete rows to predict on")
348
+ return df
295
349
 
296
350
  # pytorch-tabular returns predictions using f"{target}_prediction" column
297
- # and classification probabilities in columns ending with "_probability"
298
- target = TEMPLATE_PARAMS["target_column"]
351
+ target = TEMPLATE_PARAMS["target"]
299
352
  prediction_column = f"{target}_prediction"
300
- if prediction_column in result.columns:
301
- predictions = result[prediction_column].values
302
- else:
303
- raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
304
353
 
305
- # If we have a label encoder, decode the predictions
306
- if label_encoder:
307
- predictions = label_encoder.inverse_transform(predictions.astype(int))
354
+ # Collect predictions from all ensemble members
355
+ all_ensemble_preds = []
356
+ all_ensemble_probs = []
357
+
358
+ for ens_idx, ens_model in enumerate(ensemble_models):
359
+ result = ens_model.predict(complete_df[features])
360
+
361
+ if prediction_column in result.columns:
362
+ ens_preds = result[prediction_column].values
363
+ else:
364
+ raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
365
+
366
+ all_ensemble_preds.append(ens_preds)
367
+
368
+ # For classification, collect probabilities
369
+ if label_encoder is not None:
370
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
371
+ if prob_cols:
372
+ all_ensemble_probs.append(result[prob_cols].values)
373
+
374
+ # Stack and compute mean/std (std is 0 for single model)
375
+ ensemble_preds = np.stack(all_ensemble_preds, axis=0) # (n_ensemble, n_samples)
376
+ preds = np.mean(ensemble_preds, axis=0)
377
+ preds_std = np.std(ensemble_preds, axis=0) # Will be 0s for n_ensemble=1
308
378
 
309
- # Set the predictions on the DataFrame
310
- df["prediction"] = predictions
379
+ print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
311
380
 
312
- # For classification, get probabilities
381
+ # Handle classification vs regression
313
382
  if label_encoder is not None:
314
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
315
- if prob_cols:
316
- probs = result[prob_cols].values
317
- df["pred_proba"] = [p.tolist() for p in probs]
383
+ # For classification, average probabilities then take argmax
384
+ if all_ensemble_probs:
385
+ ensemble_probs = np.stack(all_ensemble_probs, axis=0) # (n_ensemble, n_samples, n_classes)
386
+ avg_probs = np.mean(ensemble_probs, axis=0) # (n_samples, n_classes)
387
+ class_preds = np.argmax(avg_probs, axis=1)
388
+ predictions = label_encoder.inverse_transform(class_preds)
389
+
390
+ # Build full proba Series with None for missing rows
391
+ all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
392
+ all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
393
+ df["pred_proba"] = all_proba
318
394
 
319
395
  # Expand the pred_proba column into separate columns for each class
320
396
  df = expand_proba_column(df, label_encoder.classes_)
397
+ else:
398
+ # No probabilities, use averaged predictions
399
+ predictions = label_encoder.inverse_transform(preds.astype(int))
400
+ else:
401
+ # Regression (includes uq_regressor)
402
+ predictions = preds
403
+ df.loc[~missing_mask, "prediction_std"] = preds_std
404
+
405
+ # Set predictions only for complete rows
406
+ df.loc[~missing_mask, "prediction"] = predictions
321
407
 
322
- # All done, return the DataFrame with new columns for the predictions
323
408
  return df
324
409
 
325
410
 
@@ -327,14 +412,14 @@ if __name__ == "__main__":
327
412
  """The main function is for training the PyTorch Tabular model"""
328
413
 
329
414
  # Harness Template Parameters
330
- target = TEMPLATE_PARAMS["target_column"]
415
+ target = TEMPLATE_PARAMS["target"]
331
416
  features = TEMPLATE_PARAMS["features"]
332
417
  orig_features = features.copy()
418
+ id_column = TEMPLATE_PARAMS["id_column"]
333
419
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
334
420
  model_type = TEMPLATE_PARAMS["model_type"]
335
421
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
336
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
337
- validation_split = 0.2
422
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
338
423
 
339
424
  # Script arguments for input/output directories
340
425
  parser = argparse.ArgumentParser()
@@ -346,19 +431,27 @@ if __name__ == "__main__":
346
431
  args = parser.parse_args()
347
432
 
348
433
  # Read the training data into DataFrames
349
- training_files = [
350
- os.path.join(args.train, file)
351
- for file in os.listdir(args.train)
352
- if file.endswith(".csv")
353
- ]
434
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
354
435
  print(f"Training Files: {training_files}")
355
436
 
356
437
  # Combine files and read them all into a single pandas dataframe
357
438
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
358
439
 
440
+ # Print out some info about the dataframe
441
+ print(f"All Data Shape: {all_df.shape}")
442
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
443
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
444
+
359
445
  # Check if the dataframe is empty
360
446
  check_dataframe(all_df, "training_df")
361
447
 
448
+ # Drop any rows with missing feature values
449
+ initial_row_count = all_df.shape[0]
450
+ all_df = all_df.dropna(subset=features)
451
+ dropped_rows = initial_row_count - all_df.shape[0]
452
+ if dropped_rows > 0:
453
+ print(f"Dropped {dropped_rows} rows due to missing feature values.")
454
+
362
455
  # Features/Target output
363
456
  print(f"Target: {target}")
364
457
  print(f"Features: {str(features)}")
@@ -366,125 +459,228 @@ if __name__ == "__main__":
366
459
  # Convert any features that might be categorical to 'category' type
367
460
  all_df, category_mappings = convert_categorical_types(all_df, features)
368
461
 
462
+ # Print out some info about the dataframe
463
+ print(f"All Data Shape: {all_df.shape}")
464
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
465
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
466
+
369
467
  # If we have compressed features, decompress them
370
468
  if compressed_features:
371
469
  print(f"Decompressing features {compressed_features}...")
372
470
  all_df, features = decompress_features(all_df, features, compressed_features)
373
471
 
374
- # Do we want to train on all the data?
375
- if train_all_data:
376
- print("Training on ALL of the data")
377
- df_train = all_df.copy()
378
- df_val = all_df.copy()
379
-
380
- # Does the dataframe have a training column?
381
- elif "training" in all_df.columns:
382
- print("Found training column, splitting data based on training column")
383
- df_train = all_df[all_df["training"]]
384
- df_val = all_df[~all_df["training"]]
385
- else:
386
- # Just do a random training Split
387
- print("WARNING: No training column found, splitting data with random state=42")
388
- df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
389
- print(f"FIT/TRAIN: {df_train.shape}")
390
- print(f"VALIDATION: {df_val.shape}")
391
-
392
472
  # Determine categorical and continuous columns
393
- categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
473
+ categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
394
474
  continuous_cols = [col for col in features if col not in categorical_cols]
395
-
396
475
  print(f"Categorical columns: {categorical_cols}")
397
476
  print(f"Continuous columns: {continuous_cols}")
398
477
 
399
- # Set up PyTorch Tabular configuration
400
- data_config = DataConfig(
401
- target=[target],
402
- continuous_cols=continuous_cols,
403
- categorical_cols=categorical_cols,
404
- )
478
+ # Cast continuous columns to float
479
+ all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
405
480
 
406
- trainer_config = TrainerConfig(
407
- auto_lr_find=True,
408
- batch_size=min(1024, len(df_train) // 4),
409
- max_epochs=100,
410
- early_stopping="valid_loss",
411
- early_stopping_patience=15,
412
- checkpoints="valid_loss",
413
- accelerator="auto",
414
- progress_bar="none",
415
- gradient_clip_val=1.0,
416
- )
417
-
418
- optimizer_config = OptimizerConfig()
419
-
420
- # Choose model configuration based on model type
481
+ # Choose the 'task' based on model type and set up the label encoder if needed
421
482
  if model_type == "classifier":
422
483
  task = "classification"
423
- # Encode the target column
484
+ # Encode the target column on full dataset for consistent encoding
424
485
  label_encoder = LabelEncoder()
425
- df_train[target] = label_encoder.fit_transform(df_train[target])
426
- df_val[target] = label_encoder.transform(df_val[target])
486
+ all_df[target] = label_encoder.fit_transform(all_df[target])
487
+ num_classes = len(label_encoder.classes_)
427
488
  else:
428
489
  task = "regression"
429
490
  label_encoder = None
491
+ num_classes = None
492
+
493
+ # Use any hyperparameters to set up both the trainer and model configurations
494
+ print(f"Hyperparameters: {hyperparameters}")
495
+ n_folds = hyperparameters.get("n_folds", 5) # Number of CV folds (default: 5)
496
+
497
+ # =========================================================================
498
+ # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
499
+ # =========================================================================
500
+ print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
501
+
502
+ # Create fold splits
503
+ if n_folds == 1:
504
+ # Single fold: use train/val split from "training" column or random split
505
+ if "training" in all_df.columns:
506
+ print("Found training column, splitting data based on training column")
507
+ train_idx = np.where(all_df["training"])[0]
508
+ val_idx = np.where(~all_df["training"])[0]
509
+ else:
510
+ print("WARNING: No training column found, splitting data with random 80/20 split")
511
+ indices = np.arange(len(all_df))
512
+ train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
513
+ folds = [(train_idx, val_idx)]
514
+ else:
515
+ # K-Fold CV
516
+ if model_type == "classifier":
517
+ kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
518
+ split_target = all_df[target]
519
+ else:
520
+ kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
521
+ split_target = None
522
+ folds = list(kfold.split(all_df, split_target))
523
+
524
+ # Initialize storage for out-of-fold predictions
525
+ oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
526
+ if model_type == "classifier" and num_classes and num_classes > 1:
527
+ oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
528
+ else:
529
+ oof_proba = None
430
530
 
431
- # Use CategoryEmbedding for both regression and classification tasks
432
- model_config = CategoryEmbeddingModelConfig(
433
- task=task,
434
- layers="1024-512-512",
435
- activation="ReLU",
436
- learning_rate=1e-3,
437
- dropout=0.1,
438
- use_batch_norm=True,
439
- initialization="kaiming",
440
- )
531
+ ensemble_models = []
441
532
 
442
- # Create and train the TabularModel
443
- tabular_model = TabularModel(
444
- data_config=data_config,
445
- model_config=model_config,
446
- optimizer_config=optimizer_config,
447
- trainer_config=trainer_config,
533
+ # Set up PyTorch Tabular data configuration (shared across folds)
534
+ data_config = DataConfig(
535
+ target=[target],
536
+ continuous_cols=continuous_cols,
537
+ categorical_cols=categorical_cols,
448
538
  )
449
539
 
450
- # Train the model
451
- tabular_model.fit(train=df_train, validation=df_val)
540
+ # Model config defaults
541
+ model_defaults = {
542
+ "layers": "256-128-64",
543
+ "activation": "LeakyReLU",
544
+ "learning_rate": 1e-3,
545
+ "dropout": 0.1,
546
+ "use_batch_norm": True,
547
+ "initialization": "kaiming",
548
+ }
549
+ # Override defaults with model_config if present
550
+ model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
551
+ for key, value in model_overrides.items():
552
+ print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
553
+ model_params = {**model_defaults, **model_overrides}
554
+
555
+ model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
556
+ optimizer_config = OptimizerConfig()
452
557
 
453
- # Make Predictions on the Validation Set
454
- print("Making Predictions on Validation Set...")
455
- result = tabular_model.predict(df_val, include_input_features=False)
558
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
559
+ print(f"\n{'='*50}")
560
+ print(f"Training Fold {fold_idx + 1}/{len(folds)}")
561
+ print(f"{'='*50}")
562
+
563
+ # Split data for this fold
564
+ df_train = all_df.iloc[train_idx].reset_index(drop=True)
565
+ df_val = all_df.iloc[val_idx].reset_index(drop=True)
566
+
567
+ print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
568
+
569
+ # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
570
+ # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
571
+ batch_size = min(128, max(32, len(df_train) // 16))
572
+ if len(df_train) % batch_size == 1:
573
+ batch_size += 1 # Adjust to avoid last batch of size 1
574
+ trainer_defaults = {
575
+ "auto_lr_find": False,
576
+ "batch_size": batch_size,
577
+ "max_epochs": 200,
578
+ "min_epochs": 10,
579
+ "early_stopping": "valid_loss",
580
+ "early_stopping_patience": 20,
581
+ "checkpoints": "valid_loss",
582
+ "accelerator": "auto",
583
+ "progress_bar": "none",
584
+ "gradient_clip_val": 1.0,
585
+ "seed": 42 + fold_idx,
586
+ }
587
+
588
+ # Override defaults with training_config if present
589
+ training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
590
+ if fold_idx == 0: # Only print overrides once
591
+ for key, value in training_overrides.items():
592
+ print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
593
+ trainer_params = {**trainer_defaults, **training_overrides}
594
+ trainer_config = TrainerConfig(**trainer_params)
595
+
596
+ # Create and train the TabularModel for this fold
597
+ tabular_model = TabularModel(
598
+ data_config=data_config,
599
+ model_config=model_config,
600
+ optimizer_config=optimizer_config,
601
+ trainer_config=trainer_config,
602
+ )
603
+ tabular_model.fit(train=df_train, validation=df_val)
604
+ ensemble_models.append(tabular_model)
605
+
606
+ # Make out-of-fold predictions
607
+ result = tabular_model.predict(df_val, include_input_features=False)
608
+ fold_preds = result[f"{target}_prediction"].values
609
+
610
+ # Store out-of-fold predictions
611
+ if model_type == "classifier":
612
+ oof_predictions[val_idx] = fold_preds.astype(int)
613
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
614
+ if prob_cols and oof_proba is not None:
615
+ oof_proba[val_idx] = result[prob_cols].values
616
+ else:
617
+ oof_predictions[val_idx] = fold_preds.flatten()
456
618
 
457
- # pytorch-tabular returns predictions using f"{target}_prediction" column
458
- # and classification probabilities in columns ending with "_probability"
459
- if model_type == "classifier":
460
- preds = result[f"{target}_prediction"].values
619
+ print(f"Fold {fold_idx + 1} complete!")
620
+
621
+ print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
622
+
623
+ # Use out-of-fold predictions for metrics
624
+ # For n_folds=1, we only have predictions for val_idx, so filter to those rows
625
+ if n_folds == 1:
626
+ val_mask = ~np.isnan(oof_predictions)
627
+ preds = oof_predictions[val_mask]
628
+ df_val = all_df[val_mask].copy()
629
+ if oof_proba is not None:
630
+ oof_proba = oof_proba[val_mask]
461
631
  else:
462
- # Regression: use the target column name
463
- preds = result[f"{target}_prediction"].values
632
+ preds = oof_predictions
633
+ df_val = all_df.copy()
634
+
635
+ # Compute prediction_std by running all ensemble models on validation data
636
+ # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
637
+ preds_std = None
638
+ if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
639
+ print("Computing prediction_std from ensemble predictions on validation data...")
640
+ all_ensemble_preds_for_std = []
641
+ for ens_model in ensemble_models:
642
+ result = ens_model.predict(df_val[features], include_input_features=False)
643
+ ens_preds = result[f"{target}_prediction"].values.flatten()
644
+ all_ensemble_preds_for_std.append(ens_preds)
645
+
646
+ ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
647
+ preds_std = np.std(ensemble_preds_stacked, axis=0)
648
+ print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
464
649
 
465
650
  if model_type == "classifier":
466
651
  # Get probabilities for classification
467
- print("Processing Probabilities...")
468
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
469
- if prob_cols:
470
- probs = result[prob_cols].values
471
- df_val["pred_proba"] = [p.tolist() for p in probs]
472
-
473
- # Expand the pred_proba column into separate columns for each class
474
- print(df_val.columns)
652
+ if oof_proba is not None:
653
+ df_val = df_val.copy()
654
+ df_val["pred_proba"] = [p.tolist() for p in oof_proba]
475
655
  df_val = expand_proba_column(df_val, label_encoder.classes_)
476
- print(df_val.columns)
477
656
 
478
657
  # Decode the target and prediction labels
479
658
  y_validate = label_encoder.inverse_transform(df_val[target])
480
- preds = label_encoder.inverse_transform(preds.astype(int))
659
+ preds_decoded = label_encoder.inverse_transform(preds.astype(int))
481
660
  else:
482
661
  y_validate = df_val[target].values
662
+ preds_decoded = preds
663
+
664
+ # Save predictions to S3
665
+ df_val = df_val.copy()
666
+ df_val["prediction"] = preds_decoded
667
+
668
+ # Build output columns - include id_column if it exists
669
+ output_columns = []
670
+ if id_column in df_val.columns:
671
+ output_columns.append(id_column)
672
+ output_columns += [target, "prediction"]
673
+
674
+ # Add prediction_std for regression models (always present, 0 for single model)
675
+ if model_type in ["regressor", "uq_regressor"]:
676
+ if preds_std is not None:
677
+ df_val["prediction_std"] = preds_std
678
+ else:
679
+ df_val["prediction_std"] = 0.0
680
+ output_columns.append("prediction_std")
681
+ print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
483
682
 
484
- # Save predictions to S3 (just the target, prediction, and '_probability' columns)
485
- df_val["prediction"] = preds
486
- output_columns = [target, "prediction"]
487
- output_columns += [col for col in df_val.columns if col.endswith("_probability")]
683
+ output_columns += [col for col in df_val.columns if col.endswith("_proba")]
488
684
  wr.s3.to_csv(
489
685
  df_val[output_columns],
490
686
  path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -497,7 +693,7 @@ if __name__ == "__main__":
497
693
  label_names = label_encoder.classes_
498
694
 
499
695
  # Calculate various model performance metrics
500
- scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
696
+ scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
501
697
 
502
698
  # Put the scores into a dataframe
503
699
  score_df = pd.DataFrame(
@@ -505,20 +701,20 @@ if __name__ == "__main__":
505
701
  target: label_names,
506
702
  "precision": scores[0],
507
703
  "recall": scores[1],
508
- "fscore": scores[2],
704
+ "f1": scores[2],
509
705
  "support": scores[3],
510
706
  }
511
707
  )
512
708
 
513
- # We need to get creative with the Classification Metrics
514
- metrics = ["precision", "recall", "fscore", "support"]
709
+ # Output metrics per class
710
+ metrics = ["precision", "recall", "f1", "support"]
515
711
  for t in label_names:
516
712
  for m in metrics:
517
713
  value = score_df.loc[score_df[target] == t, m].iloc[0]
518
714
  print(f"Metrics:{t}:{m} {value}")
519
715
 
520
716
  # Compute and output the confusion matrix
521
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
717
+ conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
522
718
  for i, row_name in enumerate(label_names):
523
719
  for j, col_name in enumerate(label_names):
524
720
  value = conf_mtx[i, j]
@@ -526,22 +722,37 @@ if __name__ == "__main__":
526
722
 
527
723
  else:
528
724
  # Calculate various model performance metrics (regression)
529
- rmse = root_mean_squared_error(y_validate, preds)
530
- mae = mean_absolute_error(y_validate, preds)
531
- r2 = r2_score(y_validate, preds)
532
- print(f"RMSE: {rmse:.3f}")
533
- print(f"MAE: {mae:.3f}")
534
- print(f"R2: {r2:.3f}")
535
- print(f"NumRows: {len(df_val)}")
536
-
537
- # Save the model to the standard place/name
538
- tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
725
+ rmse = root_mean_squared_error(y_validate, preds_decoded)
726
+ mae = mean_absolute_error(y_validate, preds_decoded)
727
+ medae = median_absolute_error(y_validate, preds_decoded)
728
+ r2 = r2_score(y_validate, preds_decoded)
729
+ spearman_corr = spearmanr(y_validate, preds_decoded).correlation
730
+ support = len(df_val)
731
+ print(f"rmse: {rmse:.3f}")
732
+ print(f"mae: {mae:.3f}")
733
+ print(f"medae: {medae:.3f}")
734
+ print(f"r2: {r2:.3f}")
735
+ print(f"spearmanr: {spearman_corr:.3f}")
736
+ print(f"support: {support}")
737
+
738
+ # Save ensemble models
739
+ for model_idx, ens_model in enumerate(ensemble_models):
740
+ model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
741
+ ens_model.save_model(model_path)
742
+ print(f"Saved model {model_idx + 1} to {model_path}")
743
+
744
+ # Save ensemble metadata
745
+ n_ensemble = len(ensemble_models)
746
+ ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
747
+ joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
748
+ print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
749
+
539
750
  if label_encoder:
540
751
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
541
752
 
542
753
  # Save the features (this will validate input during predictions)
543
754
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
544
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
755
+ json.dump(orig_features, fp)
545
756
 
546
757
  # Save the category mappings
547
758
  with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp: