workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. workbench/algorithms/dataframe/proximity.py +11 -4
  2. workbench/api/__init__.py +2 -1
  3. workbench/api/df_store.py +17 -108
  4. workbench/api/feature_set.py +48 -11
  5. workbench/api/model.py +1 -1
  6. workbench/api/parameter_store.py +3 -52
  7. workbench/core/artifacts/__init__.py +11 -2
  8. workbench/core/artifacts/artifact.py +5 -5
  9. workbench/core/artifacts/df_store_core.py +114 -0
  10. workbench/core/artifacts/endpoint_core.py +261 -78
  11. workbench/core/artifacts/feature_set_core.py +69 -1
  12. workbench/core/artifacts/model_core.py +48 -14
  13. workbench/core/artifacts/parameter_store_core.py +98 -0
  14. workbench/core/transforms/features_to_model/features_to_model.py +50 -33
  15. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  16. workbench/core/views/view.py +2 -2
  17. workbench/model_scripts/chemprop/chemprop.template +933 -0
  18. workbench/model_scripts/chemprop/generated_model_script.py +933 -0
  19. workbench/model_scripts/chemprop/requirements.txt +11 -0
  20. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  21. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  22. workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
  23. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
  24. workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
  25. workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
  26. workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
  27. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
  29. workbench/model_scripts/pytorch_model/pytorch.template +362 -170
  30. workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
  31. workbench/model_scripts/script_generation.py +10 -7
  32. workbench/model_scripts/uq_models/generated_model_script.py +43 -27
  33. workbench/model_scripts/uq_models/mapie.template +40 -24
  34. workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
  35. workbench/model_scripts/xgb_model/xgb_model.template +36 -7
  36. workbench/repl/workbench_shell.py +14 -5
  37. workbench/resources/open_source_api.key +1 -1
  38. workbench/scripts/endpoint_test.py +162 -0
  39. workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
  40. workbench/utils/chemprop_utils.py +761 -0
  41. workbench/utils/pytorch_utils.py +527 -0
  42. workbench/utils/xgboost_model_utils.py +10 -5
  43. workbench/web_interface/components/model_plot.py +7 -1
  44. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
  45. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
  46. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
  47. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  48. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  49. workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
  50. workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
  51. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
  52. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
  53. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
@@ -13,39 +13,38 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
13
13
  # Model Performance Scores
14
14
  from sklearn.metrics import (
15
15
  mean_absolute_error,
16
+ median_absolute_error,
16
17
  r2_score,
17
18
  root_mean_squared_error,
18
19
  precision_recall_fscore_support,
19
20
  confusion_matrix,
20
21
  )
22
+ from scipy.stats import spearmanr
21
23
 
22
24
  # Classification Encoder
23
25
  from sklearn.preprocessing import LabelEncoder
24
26
 
25
27
  # Scikit Learn Imports
26
- from sklearn.model_selection import train_test_split
28
+ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
27
29
 
28
30
  from io import StringIO
29
31
  import json
30
32
  import argparse
31
33
  import joblib
32
- import os
33
34
  import pandas as pd
34
- from typing import List, Tuple
35
35
 
36
36
  # Template Parameters
37
37
  TEMPLATE_PARAMS = {
38
- "model_type": "regressor",
39
- "target": "udm_asy_res_efflux_ratio",
40
- "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 
'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 
'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
38
+ "model_type": "uq_regressor",
39
+ "target": "mppb",
40
+ "features": ['mollogp', 'mi', 'fr_benzene', 'smr_vsa3', 'fr_halogen', 'c2sp2', 'peoe_vsa6', 'bcut2d_mwhi', 'vsa_estate1', 'mv', 'numaromaticcarbocycles', 'vsa_estate5', 'fr_nh0', 'mm', 'smr_vsa7', 'tpsa', 'c1sp2', 'mz', 'vsa_estate2', 'peoe_vsa7', 'vsa_estate10', 'vsa_estate7', 'vsa_estate6', 'smr_vsa10', 'slogp_vsa2', 'bcut2d_logphi', 'naromatom', 'axp_2dv', 'bcut2d_mrhi', 'vsa_estate8', 'slogp_vsa3', 'vsa_estate4', 'xpc_6dv', 'slogp_vsa12', 'peoe_vsa9', 'mp', 'slogp_vsa1', 'peoe_vsa1', 'xch_5dv', 'qed', 'vsa_estate3', 'fpdensitymorgan3', 'axp_2d', 'axp_0d', 'mse', 'numhacceptors', 'bertzct', 'estate_vsa8', 'minestateindex', 'estate_vsa3', 'fpdensitymorgan2', 'smr_vsa6', 'peoe_vsa8', 'slogp_vsa6', 'xp_5dv', 'hallkieralpha', 'avgipc', 'fr_arn', 'xp_7d', 'mare', 'xp_6d', 'bcut2d_mrlow', 'estate_vsa4', 'bcut2d_logplow', 'peoe_vsa10', 'maxabspartialcharge', 'peoe_vsa3', 'bcut2d_mwlow', 'axp_7d', 'minpartialcharge', 'xpc_4d', 'axp_1d', 'estate_vsa9', 'vsa_estate9', 'estate_vsa7', 'maxestateindex', 'estate_vsa6', 'smr_vsa1', 'xpc_6d', 'xch_7d', 'xc_5d', 'phi', 'axp_0dv', 'axp_3dv', 'mpe', 'xc_3d', 'xch_5d', 'xc_5dv', 'xch_6d', 'chi4n', 'axp_7dv', 'slogp_vsa5', 'axp_1dv', 'xch_6dv', 'minabsestateindex', 'numrotatablebonds', 'peoe_vsa2', 'estate_vsa2', 'slogp_vsa8', 'bcut2d_chglo', 'xch_7dv', 'kappa2', 'axp_4dv', 'xc_3dv', 'kappa1', 'nbase', 'xpc_5dv', 'maxpartialcharge', 'bcut2d_chghi', 'axp_5d', 'balabanj', 'xpc_5d', 'fpdensitymorgan1', 'xp_5d', 'smr_vsa5', 'axp_4d', 'kappa3', 'fr_morpholine', 'estate_vsa5', 'chi2n', 'labuteasa', 'axp_5dv', 'molwt', 'smr_vsa9', 'maxabsestateindex', 'xp_7dv', 'fr_bicyclic', 'numaliphaticheterocycles', 'axp_6dv', 'slogp_vsa4', 'axp_3d', 'xp_6dv', 'nocount', 'axp_6d', 'fr_aniline', 'xpc_4dv', 'xp_1d', 'c3sp2', 'numheterocycles', 'nhohcount', 'molmr', 'numaromaticheterocycles', 'chi0', 'minabspartialcharge', 'fr_ar_n', 'xp_3d', 'chi2v', 'fr_ether', 'chi1v', 'chi1', 'xp_2d', 'xp_4dv', 'xp_4d', 'chi4v', 'fr_pyridine', 'smr_vsa4', 'sps', 
'chi3n', 'heavyatommolwt', 'slogp_vsa11', 'fr_aryl_methyl', 'si', 'fractioncsp3', 'sse', 'fr_para_hydroxylation', 'slogp_vsa10', 'c1sp3', 'exactmolwt', 'numsaturatedheterocycles', 'chi1n', 'chi0n', 'fcsp3'],
41
+ "id_column": "molecule_name",
41
42
  "compressed_features": [],
42
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
43
- "train_all_data": False,
44
- "hyperparameters": {},
43
+ "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/mppb-reg-pytorch/training",
44
+ "hyperparameters": {'n_folds': 5},
45
45
  }
46
46
 
47
47
 
48
- # Function to check if dataframe is empty
49
48
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
50
49
  """
51
50
  Check if the provided dataframe is empty and raise an exception if it is.
@@ -60,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
60
59
  raise ValueError(msg)
61
60
 
62
61
 
63
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
62
+ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
64
63
  """
65
64
  Expands a column in a DataFrame containing a list of probabilities into separate columns.
66
65
 
67
66
  Args:
68
67
  df (pd.DataFrame): DataFrame containing a "pred_proba" column
69
- class_labels (List[str]): List of class labels
68
+ class_labels (list[str]): List of class labels
70
69
 
71
70
  Returns:
72
71
  pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
73
72
  """
74
-
75
- # Sanity check
76
73
  proba_column = "pred_proba"
77
74
  if proba_column not in df.columns:
78
75
  raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -89,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
89
86
 
90
87
  # Concatenate the new columns with the original DataFrame
91
88
  df = pd.concat([df, proba_df], axis=1)
92
- print(df)
93
89
  return df
94
90
 
95
91
 
96
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
92
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
97
93
  """
98
94
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
99
95
  Prioritizes exact matches, then case-insensitive matches.
@@ -118,55 +114,60 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
118
114
  return df.rename(columns=rename_dict)
119
115
 
120
116
 
121
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
117
+ def convert_categorical_types(
118
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
119
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
122
120
  """
123
121
  Converts appropriate columns to categorical type with consistent mappings.
124
122
 
125
123
  Args:
126
124
  df (pd.DataFrame): The DataFrame to process.
127
125
  features (list): List of feature names to consider for conversion.
128
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
129
- training mode. If populated, we're in inference mode.
126
+ category_mappings (dict, optional): Existing category mappings. If None or empty,
127
+ we're in training mode. If populated, we're in
128
+ inference mode.
130
129
 
131
130
  Returns:
132
131
  tuple: (processed DataFrame, category mappings dictionary)
133
132
  """
133
+ if category_mappings is None:
134
+ category_mappings = {}
135
+
134
136
  # Training mode
135
- if category_mappings == {}:
137
+ if not category_mappings:
136
138
  for col in df.select_dtypes(include=["object", "string"]):
137
139
  if col in features and df[col].nunique() < 20:
138
140
  print(f"Training mode: Converting {col} to category")
139
141
  df[col] = df[col].astype("category")
140
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
142
+ category_mappings[col] = df[col].cat.categories.tolist()
141
143
 
142
144
  # Inference mode
143
145
  else:
144
146
  for col, categories in category_mappings.items():
145
147
  if col in df.columns:
146
148
  print(f"Inference mode: Applying categorical mapping for {col}")
147
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
149
+ df[col] = pd.Categorical(df[col], categories=categories)
148
150
 
149
151
  return df, category_mappings
150
152
 
151
153
 
152
154
  def decompress_features(
153
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
154
- ) -> Tuple[pd.DataFrame, List[str]]:
155
+ df: pd.DataFrame, features: list[str], compressed_features: list[str]
156
+ ) -> tuple[pd.DataFrame, list[str]]:
155
157
  """Prepare features for the model
156
158
 
157
159
  Args:
158
160
  df (pd.DataFrame): The features DataFrame
159
- features (List[str]): Full list of feature names
160
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
161
+ features (list[str]): Full list of feature names
162
+ compressed_features (list[str]): List of feature names to decompress (bitstrings)
161
163
 
162
164
  Returns:
163
165
  pd.DataFrame: DataFrame with the decompressed features
164
- List[str]: Updated list of feature names after decompression
166
+ list[str]: Updated list of feature names after decompression
165
167
 
166
168
  Raises:
167
169
  ValueError: If any missing values are found in the specified features
168
170
  """
169
-
170
171
  # Check for any missing values in the required features
171
172
  missing_counts = df[features].isna().sum()
172
173
  if missing_counts.any():
@@ -176,10 +177,11 @@ def decompress_features(
176
177
  "WARNING: You might want to remove/replace all NaN values before processing."
177
178
  )
178
179
 
179
- # Decompress the specified compressed features
180
- decompressed_features = features
180
+ # Make a copy to avoid mutating the original list
181
+ decompressed_features = features.copy()
182
+
181
183
  for feature in compressed_features:
182
- if (feature not in df.columns) or (feature not in features):
184
+ if (feature not in df.columns) or (feature not in decompressed_features):
183
185
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
184
186
  continue
185
187
 
@@ -204,26 +206,60 @@ def decompress_features(
204
206
  return df, decompressed_features
205
207
 
206
208
 
207
- def model_fn(model_dir):
209
+ def model_fn(model_dir: str) -> dict:
210
+ """Load the PyTorch Tabular ensemble models from the specified directory.
211
+
212
+ Args:
213
+ model_dir: Directory containing the saved model(s)
214
+
215
+ Returns:
216
+ Dictionary with ensemble models and metadata
217
+ """
218
+ import torch
219
+ from functools import partial
220
+
221
+ # Load ensemble metadata if present
222
+ ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
223
+ if os.path.exists(ensemble_metadata_path):
224
+ ensemble_metadata = joblib.load(ensemble_metadata_path)
225
+ n_ensemble = ensemble_metadata["n_ensemble"]
226
+ else:
227
+ n_ensemble = 1
228
+
229
+ # Determine map_location for loading models (handle CUDA trained models on CPU inference)
230
+ map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
231
+
232
+ # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
233
+ # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
234
+ # which internally calls torch.load without map_location
235
+ original_torch_load = torch.load
236
+ torch.load = partial(original_torch_load, map_location=map_location)
208
237
 
209
238
  # Save current working directory
210
239
  original_cwd = os.getcwd()
240
+ ensemble_models = []
241
+
211
242
  try:
212
243
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
213
244
  os.chdir("/tmp")
214
245
 
215
- # Load the model
216
- model_path = os.path.join(model_dir, "tabular_model")
217
- model = TabularModel.load_model(model_path)
246
+ for ens_idx in range(n_ensemble):
247
+ # Try numbered model path first, fall back to legacy path
248
+ model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
249
+ if not os.path.exists(model_path):
250
+ model_path = os.path.join(model_dir, "tabular_model")
251
+ model = TabularModel.load_model(model_path, map_location=map_location)
252
+ ensemble_models.append(model)
218
253
 
219
- # Restore the original working directory
220
254
  finally:
255
+ # Restore torch.load and working directory
256
+ torch.load = original_torch_load
221
257
  os.chdir(original_cwd)
222
258
 
223
- return model
259
+ return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
224
260
 
225
261
 
226
- def input_fn(input_data, content_type):
262
+ def input_fn(input_data, content_type: str) -> pd.DataFrame:
227
263
  """Parse input data and return a DataFrame."""
228
264
  if not input_data:
229
265
  raise ValueError("Empty input data is not supported!")
@@ -240,29 +276,34 @@ def input_fn(input_data, content_type):
240
276
  raise ValueError(f"{content_type} not supported!")
241
277
 
242
278
 
243
- def output_fn(output_df, accept_type):
279
+ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
244
280
  """Supports both CSV and JSON output formats."""
245
281
  if "text/csv" in accept_type:
246
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
282
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
247
283
  return csv_output, "text/csv"
248
284
  elif "application/json" in accept_type:
249
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
285
+ return output_df.to_json(orient="records"), "application/json"
250
286
  else:
251
287
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
252
288
 
253
289
 
254
- def predict_fn(df, model) -> pd.DataFrame:
255
- """Make Predictions with our PyTorch Tabular Model
290
+ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
291
+ """Make Predictions with our PyTorch Tabular Model ensemble.
256
292
 
257
293
  Args:
258
294
  df (pd.DataFrame): The input DataFrame
259
- model: The TabularModel use for predictions
295
+ model_dict: Dictionary containing ensemble models and metadata
260
296
 
261
297
  Returns:
262
- pd.DataFrame: The DataFrame with the predictions added
298
+ pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
263
299
  """
300
+ model_type = TEMPLATE_PARAMS["model_type"]
264
301
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
265
302
 
303
+ # Extract ensemble models
304
+ ensemble_models = model_dict["ensemble_models"]
305
+ n_ensemble = model_dict["n_ensemble"]
306
+
266
307
  # Grab our feature columns (from training)
267
308
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
268
309
  with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -275,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
275
316
 
276
317
  # Load our Label Encoder if we have one
277
318
  label_encoder = None
278
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
279
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
319
+ label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
320
+ if os.path.exists(label_encoder_path):
321
+ label_encoder = joblib.load(label_encoder_path)
280
322
 
281
- # We're going match features in a case-insensitive manner, accounting for all the permutations
282
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
283
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
323
+ # Match features in a case-insensitive manner
284
324
  matched_df = match_features_case_insensitive(df, features)
285
325
 
286
326
  # Detect categorical types in the incoming DataFrame
@@ -291,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
291
331
  print("Decompressing features for prediction...")
292
332
  matched_df, features = decompress_features(matched_df, features, compressed_features)
293
333
 
294
- # Make predictions using the TabularModel
295
- result = model.predict(matched_df[features])
334
+ # Track rows with missing features
335
+ missing_mask = matched_df[features].isna().any(axis=1)
336
+ if missing_mask.any():
337
+ print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
338
+
339
+ # Initialize prediction columns
340
+ df["prediction"] = np.nan
341
+ if model_type in ["regressor", "uq_regressor"]:
342
+ df["prediction_std"] = np.nan
343
+
344
+ # Only predict on complete rows
345
+ complete_df = matched_df[~missing_mask]
346
+ if len(complete_df) == 0:
347
+ print("Warning: No complete rows to predict on")
348
+ return df
296
349
 
297
350
  # pytorch-tabular returns predictions using f"{target}_prediction" column
298
- # and classification probabilities in columns ending with "_probability"
299
- target = TEMPLATE_PARAMS["target_column"]
351
+ target = TEMPLATE_PARAMS["target"]
300
352
  prediction_column = f"{target}_prediction"
301
- if prediction_column in result.columns:
302
- predictions = result[prediction_column].values
303
- else:
304
- raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
305
353
 
306
- # If we have a label encoder, decode the predictions
307
- if label_encoder:
308
- predictions = label_encoder.inverse_transform(predictions.astype(int))
354
+ # Collect predictions from all ensemble members
355
+ all_ensemble_preds = []
356
+ all_ensemble_probs = []
357
+
358
+ for ens_idx, ens_model in enumerate(ensemble_models):
359
+ result = ens_model.predict(complete_df[features])
360
+
361
+ if prediction_column in result.columns:
362
+ ens_preds = result[prediction_column].values
363
+ else:
364
+ raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
365
+
366
+ all_ensemble_preds.append(ens_preds)
309
367
 
310
- # Set the predictions on the DataFrame
311
- df["prediction"] = predictions
368
+ # For classification, collect probabilities
369
+ if label_encoder is not None:
370
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
371
+ if prob_cols:
372
+ all_ensemble_probs.append(result[prob_cols].values)
312
373
 
313
- # For classification, get probabilities
374
+ # Stack and compute mean/std (std is 0 for single model)
375
+ ensemble_preds = np.stack(all_ensemble_preds, axis=0) # (n_ensemble, n_samples)
376
+ preds = np.mean(ensemble_preds, axis=0)
377
+ preds_std = np.std(ensemble_preds, axis=0) # Will be 0s for n_ensemble=1
378
+
379
+ print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
380
+
381
+ # Handle classification vs regression
314
382
  if label_encoder is not None:
315
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
316
- if prob_cols:
317
- probs = result[prob_cols].values
318
- df["pred_proba"] = [p.tolist() for p in probs]
383
+ # For classification, average probabilities then take argmax
384
+ if all_ensemble_probs:
385
+ ensemble_probs = np.stack(all_ensemble_probs, axis=0) # (n_ensemble, n_samples, n_classes)
386
+ avg_probs = np.mean(ensemble_probs, axis=0) # (n_samples, n_classes)
387
+ class_preds = np.argmax(avg_probs, axis=1)
388
+ predictions = label_encoder.inverse_transform(class_preds)
389
+
390
+ # Build full proba Series with None for missing rows
391
+ all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
392
+ all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
393
+ df["pred_proba"] = all_proba
319
394
 
320
395
  # Expand the pred_proba column into separate columns for each class
321
396
  df = expand_proba_column(df, label_encoder.classes_)
397
+ else:
398
+ # No probabilities, use averaged predictions
399
+ predictions = label_encoder.inverse_transform(preds.astype(int))
400
+ else:
401
+ # Regression (includes uq_regressor)
402
+ predictions = preds
403
+ df.loc[~missing_mask, "prediction_std"] = preds_std
404
+
405
+ # Set predictions only for complete rows
406
+ df.loc[~missing_mask, "prediction"] = predictions
322
407
 
323
- # All done, return the DataFrame with new columns for the predictions
324
408
  return df
325
409
 
326
410
 
@@ -331,12 +415,11 @@ if __name__ == "__main__":
331
415
  target = TEMPLATE_PARAMS["target"]
332
416
  features = TEMPLATE_PARAMS["features"]
333
417
  orig_features = features.copy()
418
+ id_column = TEMPLATE_PARAMS["id_column"]
334
419
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
335
420
  model_type = TEMPLATE_PARAMS["model_type"]
336
421
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
337
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
338
422
  hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
339
- validation_split = 0.2
340
423
 
341
424
  # Script arguments for input/output directories
342
425
  parser = argparse.ArgumentParser()
@@ -354,9 +437,21 @@ if __name__ == "__main__":
354
437
  # Combine files and read them all into a single pandas dataframe
355
438
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
356
439
 
440
+ # Print out some info about the dataframe
441
+ print(f"All Data Shape: {all_df.shape}")
442
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
443
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
444
+
357
445
  # Check if the dataframe is empty
358
446
  check_dataframe(all_df, "training_df")
359
447
 
448
+ # Drop any rows with missing feature values
449
+ initial_row_count = all_df.shape[0]
450
+ all_df = all_df.dropna(subset=features)
451
+ dropped_rows = initial_row_count - all_df.shape[0]
452
+ if dropped_rows > 0:
453
+ print(f"Dropped {dropped_rows} rows due to missing feature values.")
454
+
360
455
  # Features/Target output
361
456
  print(f"Target: {target}")
362
457
  print(f"Features: {str(features)}")
@@ -364,82 +459,88 @@ if __name__ == "__main__":
364
459
  # Convert any features that might be categorical to 'category' type
365
460
  all_df, category_mappings = convert_categorical_types(all_df, features)
366
461
 
462
+ # Print out some info about the dataframe
463
+ print(f"All Data Shape: {all_df.shape}")
464
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
465
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
466
+
367
467
  # If we have compressed features, decompress them
368
468
  if compressed_features:
369
469
  print(f"Decompressing features {compressed_features}...")
370
470
  all_df, features = decompress_features(all_df, features, compressed_features)
371
471
 
372
- # Do we want to train on all the data?
373
- if train_all_data:
374
- print("Training on ALL of the data")
375
- df_train = all_df.copy()
376
- df_val = all_df.copy()
377
-
378
- # Does the dataframe have a training column?
379
- elif "training" in all_df.columns:
380
- print("Found training column, splitting data based on training column")
381
- df_train = all_df[all_df["training"]]
382
- df_val = all_df[~all_df["training"]]
383
- else:
384
- # Just do a random training Split
385
- print("WARNING: No training column found, splitting data with random state=42")
386
- df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
387
- print(f"FIT/TRAIN: {df_train.shape}")
388
- print(f"VALIDATION: {df_val.shape}")
389
-
390
472
  # Determine categorical and continuous columns
391
- categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
473
+ categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
392
474
  continuous_cols = [col for col in features if col not in categorical_cols]
393
-
394
475
  print(f"Categorical columns: {categorical_cols}")
395
476
  print(f"Continuous columns: {continuous_cols}")
396
477
 
397
- # Set up PyTorch Tabular configuration
398
- data_config = DataConfig(
399
- target=[target],
400
- continuous_cols=continuous_cols,
401
- categorical_cols=categorical_cols,
402
- )
478
+ # Cast continuous columns to float
479
+ all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
403
480
 
404
- # Choose the 'task' based on model type also set up the label encoder if needed
481
+ # Choose the 'task' based on model type and set up the label encoder if needed
405
482
  if model_type == "classifier":
406
483
  task = "classification"
407
- # Encode the target column
484
+ # Encode the target column on full dataset for consistent encoding
408
485
  label_encoder = LabelEncoder()
409
- df_train[target] = label_encoder.fit_transform(df_train[target])
410
- df_val[target] = label_encoder.transform(df_val[target])
486
+ all_df[target] = label_encoder.fit_transform(all_df[target])
487
+ num_classes = len(label_encoder.classes_)
411
488
  else:
412
489
  task = "regression"
413
490
  label_encoder = None
491
+ num_classes = None
414
492
 
415
493
  # Use any hyperparameters to set up both the trainer and model configurations
416
494
  print(f"Hyperparameters: {hyperparameters}")
495
+ n_folds = hyperparameters.get("n_folds", 5) # Number of CV folds (default: 5)
496
+
497
+ # =========================================================================
498
+ # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
499
+ # =========================================================================
500
+ print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
501
+
502
+ # Create fold splits
503
+ if n_folds == 1:
504
+ # Single fold: use train/val split from "training" column or random split
505
+ if "training" in all_df.columns:
506
+ print("Found training column, splitting data based on training column")
507
+ train_idx = np.where(all_df["training"])[0]
508
+ val_idx = np.where(~all_df["training"])[0]
509
+ else:
510
+ print("WARNING: No training column found, splitting data with random 80/20 split")
511
+ indices = np.arange(len(all_df))
512
+ train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
513
+ folds = [(train_idx, val_idx)]
514
+ else:
515
+ # K-Fold CV
516
+ if model_type == "classifier":
517
+ kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
518
+ split_target = all_df[target]
519
+ else:
520
+ kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
521
+ split_target = None
522
+ folds = list(kfold.split(all_df, split_target))
523
+
524
+ # Initialize storage for out-of-fold predictions
525
+ oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
526
+ if model_type == "classifier" and num_classes and num_classes > 1:
527
+ oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
528
+ else:
529
+ oof_proba = None
417
530
 
418
- # Set up PyTorch Tabular configuration with defaults
419
- trainer_defaults = {
420
- "auto_lr_find": True,
421
- "batch_size": min(1024, max(32, len(df_train) // 4)),
422
- "max_epochs": 100,
423
- "early_stopping": "valid_loss",
424
- "early_stopping_patience": 15,
425
- "checkpoints": "valid_loss",
426
- "accelerator": "auto",
427
- "progress_bar": "none",
428
- "gradient_clip_val": 1.0,
429
- }
531
+ ensemble_models = []
430
532
 
431
- # Override defaults with training_config if present
432
- training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
433
- # Print overwrites
434
- for key, value in training_overrides.items():
435
- print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
436
- trainer_params = {**trainer_defaults, **training_overrides}
437
- trainer_config = TrainerConfig(**trainer_params)
533
+ # Set up PyTorch Tabular data configuration (shared across folds)
534
+ data_config = DataConfig(
535
+ target=[target],
536
+ continuous_cols=continuous_cols,
537
+ categorical_cols=categorical_cols,
538
+ )
438
539
 
439
540
  # Model config defaults
440
541
  model_defaults = {
441
- "layers": "1024-512-512",
442
- "activation": "ReLU",
542
+ "layers": "256-128-64",
543
+ "activation": "LeakyReLU",
443
544
  "learning_rate": 1e-3,
444
545
  "dropout": 0.1,
445
546
  "use_batch_norm": True,
@@ -447,63 +548,139 @@ if __name__ == "__main__":
447
548
  }
448
549
  # Override defaults with model_config if present
449
550
  model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
450
- # Print overwrites
451
551
  for key, value in model_overrides.items():
452
552
  print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
453
553
  model_params = {**model_defaults, **model_overrides}
454
554
 
455
- # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
456
- # Works effectively for both regression and classification as the foundational
457
- # architecture in PyTorch Tabular
458
555
  model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
459
556
  optimizer_config = OptimizerConfig()
460
557
 
461
- #####################################
462
- # Create and train the TabularModel #
463
- #####################################
464
- tabular_model = TabularModel(
465
- data_config=data_config,
466
- model_config=model_config,
467
- optimizer_config=optimizer_config,
468
- trainer_config=trainer_config,
469
- )
470
- tabular_model.fit(train=df_train, validation=df_val)
558
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
559
+ print(f"\n{'='*50}")
560
+ print(f"Training Fold {fold_idx + 1}/{len(folds)}")
561
+ print(f"{'='*50}")
562
+
563
+ # Split data for this fold
564
+ df_train = all_df.iloc[train_idx].reset_index(drop=True)
565
+ df_val = all_df.iloc[val_idx].reset_index(drop=True)
566
+
567
+ print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
568
+
569
+ # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
570
+ # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
571
+ batch_size = min(128, max(32, len(df_train) // 16))
572
+ if len(df_train) % batch_size == 1:
573
+ batch_size += 1 # Adjust to avoid last batch of size 1
574
+ trainer_defaults = {
575
+ "auto_lr_find": False,
576
+ "batch_size": batch_size,
577
+ "max_epochs": 200,
578
+ "min_epochs": 10,
579
+ "early_stopping": "valid_loss",
580
+ "early_stopping_patience": 20,
581
+ "checkpoints": "valid_loss",
582
+ "accelerator": "auto",
583
+ "progress_bar": "none",
584
+ "gradient_clip_val": 1.0,
585
+ "seed": 42 + fold_idx,
586
+ }
587
+
588
+ # Override defaults with training_config if present
589
+ training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
590
+ if fold_idx == 0: # Only print overrides once
591
+ for key, value in training_overrides.items():
592
+ print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
593
+ trainer_params = {**trainer_defaults, **training_overrides}
594
+ trainer_config = TrainerConfig(**trainer_params)
595
+
596
+ # Create and train the TabularModel for this fold
597
+ tabular_model = TabularModel(
598
+ data_config=data_config,
599
+ model_config=model_config,
600
+ optimizer_config=optimizer_config,
601
+ trainer_config=trainer_config,
602
+ )
603
+ tabular_model.fit(train=df_train, validation=df_val)
604
+ ensemble_models.append(tabular_model)
605
+
606
+ # Make out-of-fold predictions
607
+ result = tabular_model.predict(df_val, include_input_features=False)
608
+ fold_preds = result[f"{target}_prediction"].values
609
+
610
+ # Store out-of-fold predictions
611
+ if model_type == "classifier":
612
+ oof_predictions[val_idx] = fold_preds.astype(int)
613
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
614
+ if prob_cols and oof_proba is not None:
615
+ oof_proba[val_idx] = result[prob_cols].values
616
+ else:
617
+ oof_predictions[val_idx] = fold_preds.flatten()
471
618
 
472
- # Make Predictions on the Validation Set
473
- print("Making Predictions on Validation Set...")
474
- result = tabular_model.predict(df_val, include_input_features=False)
619
+ print(f"Fold {fold_idx + 1} complete!")
475
620
 
476
- # pytorch-tabular returns predictions using f"{target}_prediction" column
477
- # and classification probabilities in columns ending with "_probability"
478
- if model_type == "classifier":
479
- preds = result[f"{target}_prediction"].values
621
+ print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
622
+
623
+ # Use out-of-fold predictions for metrics
624
+ # For n_folds=1, we only have predictions for val_idx, so filter to those rows
625
+ if n_folds == 1:
626
+ val_mask = ~np.isnan(oof_predictions)
627
+ preds = oof_predictions[val_mask]
628
+ df_val = all_df[val_mask].copy()
629
+ if oof_proba is not None:
630
+ oof_proba = oof_proba[val_mask]
480
631
  else:
481
- # Regression: use the target column name
482
- preds = result[f"{target}_prediction"].values
632
+ preds = oof_predictions
633
+ df_val = all_df.copy()
634
+
635
+ # Compute prediction_std by running all ensemble models on validation data
636
+ # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
637
+ preds_std = None
638
+ if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
639
+ print("Computing prediction_std from ensemble predictions on validation data...")
640
+ all_ensemble_preds_for_std = []
641
+ for ens_model in ensemble_models:
642
+ result = ens_model.predict(df_val[features], include_input_features=False)
643
+ ens_preds = result[f"{target}_prediction"].values.flatten()
644
+ all_ensemble_preds_for_std.append(ens_preds)
645
+
646
+ ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
647
+ preds_std = np.std(ensemble_preds_stacked, axis=0)
648
+ print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
483
649
 
484
650
  if model_type == "classifier":
485
651
  # Get probabilities for classification
486
- print("Processing Probabilities...")
487
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
488
- if prob_cols:
489
- probs = result[prob_cols].values
490
- df_val["pred_proba"] = [p.tolist() for p in probs]
491
-
492
- # Expand the pred_proba column into separate columns for each class
493
- print(df_val.columns)
652
+ if oof_proba is not None:
653
+ df_val = df_val.copy()
654
+ df_val["pred_proba"] = [p.tolist() for p in oof_proba]
494
655
  df_val = expand_proba_column(df_val, label_encoder.classes_)
495
- print(df_val.columns)
496
656
 
497
657
  # Decode the target and prediction labels
498
658
  y_validate = label_encoder.inverse_transform(df_val[target])
499
- preds = label_encoder.inverse_transform(preds.astype(int))
659
+ preds_decoded = label_encoder.inverse_transform(preds.astype(int))
500
660
  else:
501
661
  y_validate = df_val[target].values
662
+ preds_decoded = preds
663
+
664
+ # Save predictions to S3
665
+ df_val = df_val.copy()
666
+ df_val["prediction"] = preds_decoded
667
+
668
+ # Build output columns - include id_column if it exists
669
+ output_columns = []
670
+ if id_column in df_val.columns:
671
+ output_columns.append(id_column)
672
+ output_columns += [target, "prediction"]
673
+
674
+ # Add prediction_std for regression models (always present, 0 for single model)
675
+ if model_type in ["regressor", "uq_regressor"]:
676
+ if preds_std is not None:
677
+ df_val["prediction_std"] = preds_std
678
+ else:
679
+ df_val["prediction_std"] = 0.0
680
+ output_columns.append("prediction_std")
681
+ print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
502
682
 
503
- # Save predictions to S3 (just the target, prediction, and '_probability' columns)
504
- df_val["prediction"] = preds
505
- output_columns = [target, "prediction"]
506
- output_columns += [col for col in df_val.columns if col.endswith("_probability")]
683
+ output_columns += [col for col in df_val.columns if col.endswith("_proba")]
507
684
  wr.s3.to_csv(
508
685
  df_val[output_columns],
509
686
  path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -516,7 +693,7 @@ if __name__ == "__main__":
516
693
  label_names = label_encoder.classes_
517
694
 
518
695
  # Calculate various model performance metrics
519
- scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
696
+ scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
520
697
 
521
698
  # Put the scores into a dataframe
522
699
  score_df = pd.DataFrame(
@@ -529,7 +706,7 @@ if __name__ == "__main__":
529
706
  }
530
707
  )
531
708
 
532
- # We need to get creative with the Classification Metrics
709
+ # Output metrics per class
533
710
  metrics = ["precision", "recall", "f1", "support"]
534
711
  for t in label_names:
535
712
  for m in metrics:
@@ -537,7 +714,7 @@ if __name__ == "__main__":
537
714
  print(f"Metrics:{t}:{m} {value}")
538
715
 
539
716
  # Compute and output the confusion matrix
540
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
717
+ conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
541
718
  for i, row_name in enumerate(label_names):
542
719
  for j, col_name in enumerate(label_names):
543
720
  value = conf_mtx[i, j]
@@ -545,22 +722,37 @@ if __name__ == "__main__":
545
722
 
546
723
  else:
547
724
  # Calculate various model performance metrics (regression)
548
- rmse = root_mean_squared_error(y_validate, preds)
549
- mae = mean_absolute_error(y_validate, preds)
550
- r2 = r2_score(y_validate, preds)
551
- print(f"RMSE: {rmse:.3f}")
552
- print(f"MAE: {mae:.3f}")
553
- print(f"R2: {r2:.3f}")
554
- print(f"NumRows: {len(df_val)}")
555
-
556
- # Save the model to the standard place/name
557
- tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
725
+ rmse = root_mean_squared_error(y_validate, preds_decoded)
726
+ mae = mean_absolute_error(y_validate, preds_decoded)
727
+ medae = median_absolute_error(y_validate, preds_decoded)
728
+ r2 = r2_score(y_validate, preds_decoded)
729
+ spearman_corr = spearmanr(y_validate, preds_decoded).correlation
730
+ support = len(df_val)
731
+ print(f"rmse: {rmse:.3f}")
732
+ print(f"mae: {mae:.3f}")
733
+ print(f"medae: {medae:.3f}")
734
+ print(f"r2: {r2:.3f}")
735
+ print(f"spearmanr: {spearman_corr:.3f}")
736
+ print(f"support: {support}")
737
+
738
+ # Save ensemble models
739
+ for model_idx, ens_model in enumerate(ensemble_models):
740
+ model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
741
+ ens_model.save_model(model_path)
742
+ print(f"Saved model {model_idx + 1} to {model_path}")
743
+
744
+ # Save ensemble metadata
745
+ n_ensemble = len(ensemble_models)
746
+ ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
747
+ joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
748
+ print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
749
+
558
750
  if label_encoder:
559
751
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
560
752
 
561
753
  # Save the features (this will validate input during predictions)
562
754
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
563
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
755
+ json.dump(orig_features, fp)
564
756
 
565
757
  # Save the category mappings
566
758
  with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp: