workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (113)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +261 -235
  4. workbench/algorithms/graph/light/proximity_graph.py +10 -8
  5. workbench/api/__init__.py +2 -1
  6. workbench/api/compound.py +1 -1
  7. workbench/api/endpoint.py +11 -0
  8. workbench/api/feature_set.py +11 -8
  9. workbench/api/meta.py +5 -2
  10. workbench/api/model.py +16 -15
  11. workbench/api/monitor.py +1 -16
  12. workbench/core/artifacts/__init__.py +11 -2
  13. workbench/core/artifacts/artifact.py +11 -3
  14. workbench/core/artifacts/data_capture_core.py +355 -0
  15. workbench/core/artifacts/endpoint_core.py +256 -118
  16. workbench/core/artifacts/feature_set_core.py +265 -16
  17. workbench/core/artifacts/model_core.py +107 -60
  18. workbench/core/artifacts/monitor_core.py +33 -248
  19. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  20. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  21. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  22. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  23. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  24. workbench/core/transforms/features_to_model/features_to_model.py +42 -32
  25. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  26. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  27. workbench/core/views/training_view.py +113 -42
  28. workbench/core/views/view.py +53 -3
  29. workbench/core/views/view_utils.py +4 -4
  30. workbench/model_scripts/chemprop/chemprop.template +852 -0
  31. workbench/model_scripts/chemprop/generated_model_script.py +852 -0
  32. workbench/model_scripts/chemprop/requirements.txt +11 -0
  33. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  34. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  35. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  36. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  37. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  38. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  39. workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
  40. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  41. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  42. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  43. workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
  44. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  45. workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
  46. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  47. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  48. workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
  49. workbench/model_scripts/pytorch_model/pytorch.template +370 -187
  50. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  51. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  52. workbench/model_scripts/script_generation.py +17 -9
  53. workbench/model_scripts/uq_models/generated_model_script.py +605 -0
  54. workbench/model_scripts/uq_models/mapie.template +605 -0
  55. workbench/model_scripts/uq_models/requirements.txt +1 -0
  56. workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
  57. workbench/model_scripts/xgb_model/xgb_model.template +44 -46
  58. workbench/repl/workbench_shell.py +28 -14
  59. workbench/scripts/endpoint_test.py +162 -0
  60. workbench/scripts/lambda_test.py +73 -0
  61. workbench/scripts/ml_pipeline_batch.py +137 -0
  62. workbench/scripts/ml_pipeline_sqs.py +186 -0
  63. workbench/scripts/monitor_cloud_watch.py +20 -100
  64. workbench/utils/aws_utils.py +4 -3
  65. workbench/utils/chem_utils/__init__.py +0 -0
  66. workbench/utils/chem_utils/fingerprints.py +134 -0
  67. workbench/utils/chem_utils/misc.py +194 -0
  68. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  69. workbench/utils/chem_utils/mol_standardize.py +450 -0
  70. workbench/utils/chem_utils/mol_tagging.py +348 -0
  71. workbench/utils/chem_utils/projections.py +209 -0
  72. workbench/utils/chem_utils/salts.py +256 -0
  73. workbench/utils/chem_utils/sdf.py +292 -0
  74. workbench/utils/chem_utils/toxicity.py +250 -0
  75. workbench/utils/chem_utils/vis.py +253 -0
  76. workbench/utils/chemprop_utils.py +760 -0
  77. workbench/utils/cloudwatch_handler.py +1 -1
  78. workbench/utils/cloudwatch_utils.py +137 -0
  79. workbench/utils/config_manager.py +3 -7
  80. workbench/utils/endpoint_utils.py +5 -7
  81. workbench/utils/license_manager.py +2 -6
  82. workbench/utils/model_utils.py +95 -34
  83. workbench/utils/monitor_utils.py +44 -62
  84. workbench/utils/pandas_utils.py +3 -3
  85. workbench/utils/pytorch_utils.py +526 -0
  86. workbench/utils/shap_utils.py +10 -2
  87. workbench/utils/workbench_logging.py +0 -3
  88. workbench/utils/workbench_sqs.py +1 -1
  89. workbench/utils/xgboost_model_utils.py +371 -156
  90. workbench/web_interface/components/model_plot.py +7 -1
  91. workbench/web_interface/components/plugin_unit_test.py +5 -2
  92. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  93. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  94. workbench/web_interface/components/plugins/model_details.py +9 -7
  95. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  96. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
  97. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
  98. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
  99. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
  100. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  101. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  102. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  103. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  104. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  105. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  106. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  107. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  108. workbench/utils/chem_utils.py +0 -1556
  109. workbench/utils/execution_environment.py +0 -211
  110. workbench/utils/fast_inference.py +0 -167
  111. workbench/utils/resource_utils.py +0 -39
  112. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
  113. {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -13,39 +13,38 @@ from pytorch_tabular.models import CategoryEmbeddingModelConfig
13
13
  # Model Performance Scores
14
14
  from sklearn.metrics import (
15
15
  mean_absolute_error,
16
+ median_absolute_error,
16
17
  r2_score,
17
18
  root_mean_squared_error,
18
19
  precision_recall_fscore_support,
19
20
  confusion_matrix,
20
21
  )
22
+ from scipy.stats import spearmanr
21
23
 
22
24
  # Classification Encoder
23
25
  from sklearn.preprocessing import LabelEncoder
24
26
 
25
27
  # Scikit Learn Imports
26
- from sklearn.model_selection import train_test_split
28
+ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
27
29
 
28
30
  from io import StringIO
29
31
  import json
30
32
  import argparse
31
33
  import joblib
32
- import os
33
34
  import pandas as pd
34
- from typing import List, Tuple
35
35
 
36
36
  # Template Parameters
37
37
  TEMPLATE_PARAMS = {
38
- "model_type": "classifier",
39
- "target_column": "solubility_class",
40
- "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
38
+ "model_type": "uq_regressor",
39
+ "target": "udm_asy_res_efflux_ratio",
40
+ "features": ['smr_vsa4', 'tpsa', 'nhohcount', 'peoe_vsa1', 'mollogp', 'numhdonors', 'tertiary_amine_count', 'smr_vsa3', 'nitrogen_span', 'vsa_estate2', 'hba_hbd_ratio', 'minpartialcharge', 'estate_vsa4', 'asphericity', 'charge_centroid_distance', 'peoe_vsa8', 'mi', 'estate_vsa8', 'vsa_estate6', 'vsa_estate3', 'molecular_volume_3d', 'kappa3', 'smr_vsa5', 'sv', 'xp_6dv', 'xc_4dv', 'si', 'molecular_axis_length', 'axp_5d', 'estate_vsa3', 'estate_vsa10', 'axp_7dv', 'slogp_vsa1', 'molecular_asymmetry', 'molmr', 'qed', 'xp_3d', 'axp_0dv', 'fpdensitymorgan1', 'minabsestateindex', 'numatomstereocenters', 'fpdensitymorgan2', 'slogp_vsa2', 'xch_5dv', 'num_s_centers', 'aromatic_interaction_score', 'axp_2dv', 'chi1v', 'hallkieralpha', 'vsa_estate8', 'peoe_vsa9', 'type_ii_pattern_count', 'slogp_vsa5', 'xc_3d', 'amphiphilic_moment', 'bcut2d_logphi', 'estate_vsa6', 'xc_3dv', 'chi0n', 'vsa_estate5', 'xpc_6d', 'vsa_estate7', 'axp_1d', 'axp_7d', 'xch_4dv', 'phi', 'maxestateindex', 'sps', 'bcut2d_mrlow', 'vsa_estate4', 'avgipc', 'bcut2d_mrhi', 'bcut2d_logplow', 'axp_1dv', 'kappa1', 'vsa_estate9', 'fr_imidazole', 'axp_6d', 'radius_of_gyration', 'chi2v', 'chi4n', 'xp_7d', 'smr_vsa6', 'axp_2d', 'num_r_centers', 'xch_7dv', 'estate_vsa2', 'axp_4d', 'maxpartialcharge', 'xpc_5dv', 'xp_5d', 'chi3n', 'chi2n', 'vsa_estate1', 'slogp_vsa11', 'bcut2d_mwlow', 'mm', 'c3sp3', 'numhacceptors', 'fr_nhpyrrole', 'labuteasa', 'fpdensitymorgan3', 'bcut2d_chghi', 'axp_3dv', 'c1sp3', 'kappa2', 'smr_vsa9', 'xp_6d', 'estate_vsa7', 'axp_6dv', 'sp', 'estate_vsa5', 'peoe_vsa2', 'smr_vsa1', 'mp', 'minestateindex', 'axp_3d', 'axp_4dv', 'chi0v', 'slogp_vsa3', 'heavyatommolwt', 'smr_vsa7', 'peoe_vsa6', 'mv', 'xp_4d', 'peoe_vsa7', 'mpe', 'chi4v', 'maxabspartialcharge', 'bcut2d_chglo', 'c1sp2', 'xp_0dv', 'smr_vsa10', 'estate_vsa1', 'fr_pyridine', 'bcut2d_mwhi', 'spe', 'balabanj', 'xch_7d', 'estate_vsa9', 'xp_3dv', 'fr_piperzine', 'xch_6dv', 'slogp_vsa8', 'peoe_vsa10', 'xp_4dv', 'c3sp2', 'fr_al_oh', 'xc_5d', 
'fractioncsp3', 'fr_bicyclic', 'fr_piperdine', 'peoe_vsa12', 'peoe_vsa11', 'numheteroatoms', 'mse', 'xp_7dv', 'chi1', 'xpc_6dv', 'numsaturatedcarbocycles', 'chi1n', 'bertzct', 'xc_5dv', 'chi3v', 'intramolecular_hbond_potential', 'peoe_vsa4', 'xpc_5d', 'xp_2d', 'nbase', 'fr_priamide', 'slogp_vsa4', 'naromatom', 'vsa_estate10', 'fr_nitrile', 'molwt', 'peoe_vsa13', 'xch_4d', 'xp_5dv', 'numaromaticheterocycles', 'xpc_4dv', 'fr_hoccn', 'nocount', 'fr_nh1', 'mz', 'xc_6dv', 'hybratio', 'fr_imine', 'fr_morpholine', 'xpc_4d', 'xch_5d', 'numvalenceelectrons', 'numheterocycles', 'fr_aniline', 'fr_nh0', 'frac_defined_stereo', 'fr_benzene', 'xp_2dv', 'type_i_pattern_count', 'fr_ketone_topliss', 'fr_aryl_methyl', 'heavyatomcount', 'mare', 'axp_5dv', 'exactmolwt', 'xch_6d', 'xp_1d', 'xch_3d', 'axp_0d', 'amide_count', 'sse', 'slogp_vsa7', 'c2sp2', 'numrotatablebonds', 'chi0', 'xc_4d', 'slogp_vsa10', 'fr_al_oh_notert', 'numspiroatoms', 'numsaturatedrings', 'minabspartialcharge', 'fr_sulfone', 'slogp_vsa6', 'smr_vsa2', 'num_defined_stereocenters', 'numbridgeheadatoms', 'peoe_vsa3', 'numaliphaticheterocycles', 'fr_ndealkylation1', 'xc_6d'],
41
+ "id_column": "udm_mol_bat_id",
41
42
  "compressed_features": [],
42
- "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-class/training",
43
- "train_all_data": False,
44
- "hyperparameters": {'training_config': {'max_epochs': 150}, 'model_config': {'layers': '256-128-64'}}
43
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
44
+ "hyperparameters": {'n_folds': 5, 'training_config': {'max_epochs': 200, 'early_stopping_patience': 20}, 'model_config': {'layers': '256-128-64', 'dropout': 0.1, 'learning_rate': 0.001, 'activation': 'LeakyReLU'}},
45
45
  }
46
46
 
47
47
 
48
- # Function to check if dataframe is empty
49
48
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
50
49
  """
51
50
  Check if the provided dataframe is empty and raise an exception if it is.
@@ -60,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
60
59
  raise ValueError(msg)
61
60
 
62
61
 
63
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
62
+ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
64
63
  """
65
64
  Expands a column in a DataFrame containing a list of probabilities into separate columns.
66
65
 
67
66
  Args:
68
67
  df (pd.DataFrame): DataFrame containing a "pred_proba" column
69
- class_labels (List[str]): List of class labels
68
+ class_labels (list[str]): List of class labels
70
69
 
71
70
  Returns:
72
71
  pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
73
72
  """
74
-
75
- # Sanity check
76
73
  proba_column = "pred_proba"
77
74
  if proba_column not in df.columns:
78
75
  raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -89,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
89
86
 
90
87
  # Concatenate the new columns with the original DataFrame
91
88
  df = pd.concat([df, proba_df], axis=1)
92
- print(df)
93
89
  return df
94
90
 
95
91
 
96
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
92
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
97
93
  """
98
94
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
99
95
  Prioritizes exact matches, then case-insensitive matches.
@@ -103,7 +99,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
103
99
  df_columns_lower = {col.lower(): col for col in df.columns}
104
100
  rename_dict = {}
105
101
  missing = []
106
-
107
102
  for feature in model_features:
108
103
  if feature in df.columns:
109
104
  continue # Exact match
@@ -115,58 +110,64 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
115
110
  if missing:
116
111
  raise ValueError(f"Features not found: {missing}")
117
112
 
113
+ # Rename the DataFrame columns to match the model features
118
114
  return df.rename(columns=rename_dict)
119
115
 
120
116
 
121
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
117
+ def convert_categorical_types(
118
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
119
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
122
120
  """
123
121
  Converts appropriate columns to categorical type with consistent mappings.
124
122
 
125
123
  Args:
126
124
  df (pd.DataFrame): The DataFrame to process.
127
125
  features (list): List of feature names to consider for conversion.
128
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
129
- training mode. If populated, we're in inference mode.
126
+ category_mappings (dict, optional): Existing category mappings. If None or empty,
127
+ we're in training mode. If populated, we're in
128
+ inference mode.
130
129
 
131
130
  Returns:
132
131
  tuple: (processed DataFrame, category mappings dictionary)
133
132
  """
133
+ if category_mappings is None:
134
+ category_mappings = {}
135
+
134
136
  # Training mode
135
- if category_mappings == {}:
137
+ if not category_mappings:
136
138
  for col in df.select_dtypes(include=["object", "string"]):
137
139
  if col in features and df[col].nunique() < 20:
138
140
  print(f"Training mode: Converting {col} to category")
139
141
  df[col] = df[col].astype("category")
140
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
142
+ category_mappings[col] = df[col].cat.categories.tolist()
141
143
 
142
144
  # Inference mode
143
145
  else:
144
146
  for col, categories in category_mappings.items():
145
147
  if col in df.columns:
146
148
  print(f"Inference mode: Applying categorical mapping for {col}")
147
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
149
+ df[col] = pd.Categorical(df[col], categories=categories)
148
150
 
149
151
  return df, category_mappings
150
152
 
151
153
 
152
154
  def decompress_features(
153
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
154
- ) -> Tuple[pd.DataFrame, List[str]]:
155
+ df: pd.DataFrame, features: list[str], compressed_features: list[str]
156
+ ) -> tuple[pd.DataFrame, list[str]]:
155
157
  """Prepare features for the model
156
158
 
157
159
  Args:
158
160
  df (pd.DataFrame): The features DataFrame
159
- features (List[str]): Full list of feature names
160
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
161
+ features (list[str]): Full list of feature names
162
+ compressed_features (list[str]): List of feature names to decompress (bitstrings)
161
163
 
162
164
  Returns:
163
165
  pd.DataFrame: DataFrame with the decompressed features
164
- List[str]: Updated list of feature names after decompression
166
+ list[str]: Updated list of feature names after decompression
165
167
 
166
168
  Raises:
167
169
  ValueError: If any missing values are found in the specified features
168
170
  """
169
-
170
171
  # Check for any missing values in the required features
171
172
  missing_counts = df[features].isna().sum()
172
173
  if missing_counts.any():
@@ -176,10 +177,11 @@ def decompress_features(
176
177
  "WARNING: You might want to remove/replace all NaN values before processing."
177
178
  )
178
179
 
179
- # Decompress the specified compressed features
180
- decompressed_features = features
180
+ # Make a copy to avoid mutating the original list
181
+ decompressed_features = features.copy()
182
+
181
183
  for feature in compressed_features:
182
- if (feature not in df.columns) or (feature not in features):
184
+ if (feature not in df.columns) or (feature not in decompressed_features):
183
185
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
184
186
  continue
185
187
 
@@ -204,26 +206,60 @@ def decompress_features(
204
206
  return df, decompressed_features
205
207
 
206
208
 
207
- def model_fn(model_dir):
209
+ def model_fn(model_dir: str) -> dict:
210
+ """Load the PyTorch Tabular ensemble models from the specified directory.
211
+
212
+ Args:
213
+ model_dir: Directory containing the saved model(s)
214
+
215
+ Returns:
216
+ Dictionary with ensemble models and metadata
217
+ """
218
+ import torch
219
+ from functools import partial
220
+
221
+ # Load ensemble metadata if present
222
+ ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
223
+ if os.path.exists(ensemble_metadata_path):
224
+ ensemble_metadata = joblib.load(ensemble_metadata_path)
225
+ n_ensemble = ensemble_metadata["n_ensemble"]
226
+ else:
227
+ n_ensemble = 1
228
+
229
+ # Determine map_location for loading models (handle CUDA trained models on CPU inference)
230
+ map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
231
+
232
+ # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
233
+ # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
234
+ # which internally calls torch.load without map_location
235
+ original_torch_load = torch.load
236
+ torch.load = partial(original_torch_load, map_location=map_location)
208
237
 
209
238
  # Save current working directory
210
239
  original_cwd = os.getcwd()
240
+ ensemble_models = []
241
+
211
242
  try:
212
243
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
213
- os.chdir('/tmp')
244
+ os.chdir("/tmp")
214
245
 
215
- # Load the model
216
- model_path = os.path.join(model_dir, "tabular_model")
217
- model = TabularModel.load_model(model_path)
246
+ for ens_idx in range(n_ensemble):
247
+ # Try numbered model path first, fall back to legacy path
248
+ model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
249
+ if not os.path.exists(model_path):
250
+ model_path = os.path.join(model_dir, "tabular_model")
251
+ model = TabularModel.load_model(model_path, map_location=map_location)
252
+ ensemble_models.append(model)
218
253
 
219
- # Restore the original working directory
220
254
  finally:
255
+ # Restore torch.load and working directory
256
+ torch.load = original_torch_load
221
257
  os.chdir(original_cwd)
222
258
 
223
- return model
259
+ return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
224
260
 
225
261
 
226
- def input_fn(input_data, content_type):
262
+ def input_fn(input_data, content_type: str) -> pd.DataFrame:
227
263
  """Parse input data and return a DataFrame."""
228
264
  if not input_data:
229
265
  raise ValueError("Empty input data is not supported!")
@@ -240,29 +276,34 @@ def input_fn(input_data, content_type):
240
276
  raise ValueError(f"{content_type} not supported!")
241
277
 
242
278
 
243
- def output_fn(output_df, accept_type):
279
+ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
244
280
  """Supports both CSV and JSON output formats."""
245
281
  if "text/csv" in accept_type:
246
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
282
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
247
283
  return csv_output, "text/csv"
248
284
  elif "application/json" in accept_type:
249
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
285
+ return output_df.to_json(orient="records"), "application/json"
250
286
  else:
251
287
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
252
288
 
253
289
 
254
- def predict_fn(df, model) -> pd.DataFrame:
255
- """Make Predictions with our PyTorch Tabular Model
290
+ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
291
+ """Make Predictions with our PyTorch Tabular Model ensemble.
256
292
 
257
293
  Args:
258
294
  df (pd.DataFrame): The input DataFrame
259
- model: The TabularModel use for predictions
295
+ model_dict: Dictionary containing ensemble models and metadata
260
296
 
261
297
  Returns:
262
- pd.DataFrame: The DataFrame with the predictions added
298
+ pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
263
299
  """
300
+ model_type = TEMPLATE_PARAMS["model_type"]
264
301
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
265
302
 
303
+ # Extract ensemble models
304
+ ensemble_models = model_dict["ensemble_models"]
305
+ n_ensemble = model_dict["n_ensemble"]
306
+
266
307
  # Grab our feature columns (from training)
267
308
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
268
309
  with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -275,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
275
316
 
276
317
  # Load our Label Encoder if we have one
277
318
  label_encoder = None
278
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
279
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
319
+ label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
320
+ if os.path.exists(label_encoder_path):
321
+ label_encoder = joblib.load(label_encoder_path)
280
322
 
281
- # We're going match features in a case-insensitive manner, accounting for all the permutations
282
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
283
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
323
+ # Match features in a case-insensitive manner
284
324
  matched_df = match_features_case_insensitive(df, features)
285
325
 
286
326
  # Detect categorical types in the incoming DataFrame
@@ -291,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
291
331
  print("Decompressing features for prediction...")
292
332
  matched_df, features = decompress_features(matched_df, features, compressed_features)
293
333
 
294
- # Make predictions using the TabularModel
295
- result = model.predict(matched_df[features])
334
+ # Track rows with missing features
335
+ missing_mask = matched_df[features].isna().any(axis=1)
336
+ if missing_mask.any():
337
+ print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
338
+
339
+ # Initialize prediction columns
340
+ df["prediction"] = np.nan
341
+ if model_type in ["regressor", "uq_regressor"]:
342
+ df["prediction_std"] = np.nan
343
+
344
+ # Only predict on complete rows
345
+ complete_df = matched_df[~missing_mask]
346
+ if len(complete_df) == 0:
347
+ print("Warning: No complete rows to predict on")
348
+ return df
296
349
 
297
350
  # pytorch-tabular returns predictions using f"{target}_prediction" column
298
- # and classification probabilities in columns ending with "_probability"
299
- target = TEMPLATE_PARAMS["target_column"]
351
+ target = TEMPLATE_PARAMS["target"]
300
352
  prediction_column = f"{target}_prediction"
301
- if prediction_column in result.columns:
302
- predictions = result[prediction_column].values
303
- else:
304
- raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
305
353
 
306
- # If we have a label encoder, decode the predictions
307
- if label_encoder:
308
- predictions = label_encoder.inverse_transform(predictions.astype(int))
354
+ # Collect predictions from all ensemble members
355
+ all_ensemble_preds = []
356
+ all_ensemble_probs = []
357
+
358
+ for ens_idx, ens_model in enumerate(ensemble_models):
359
+ result = ens_model.predict(complete_df[features])
360
+
361
+ if prediction_column in result.columns:
362
+ ens_preds = result[prediction_column].values
363
+ else:
364
+ raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
365
+
366
+ all_ensemble_preds.append(ens_preds)
309
367
 
310
- # Set the predictions on the DataFrame
311
- df["prediction"] = predictions
368
+ # For classification, collect probabilities
369
+ if label_encoder is not None:
370
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
371
+ if prob_cols:
372
+ all_ensemble_probs.append(result[prob_cols].values)
312
373
 
313
- # For classification, get probabilities
374
+ # Stack and compute mean/std (std is 0 for single model)
375
+ ensemble_preds = np.stack(all_ensemble_preds, axis=0) # (n_ensemble, n_samples)
376
+ preds = np.mean(ensemble_preds, axis=0)
377
+ preds_std = np.std(ensemble_preds, axis=0) # Will be 0s for n_ensemble=1
378
+
379
+ print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
380
+
381
+ # Handle classification vs regression
314
382
  if label_encoder is not None:
315
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
316
- if prob_cols:
317
- probs = result[prob_cols].values
318
- df["pred_proba"] = [p.tolist() for p in probs]
383
+ # For classification, average probabilities then take argmax
384
+ if all_ensemble_probs:
385
+ ensemble_probs = np.stack(all_ensemble_probs, axis=0) # (n_ensemble, n_samples, n_classes)
386
+ avg_probs = np.mean(ensemble_probs, axis=0) # (n_samples, n_classes)
387
+ class_preds = np.argmax(avg_probs, axis=1)
388
+ predictions = label_encoder.inverse_transform(class_preds)
389
+
390
+ # Build full proba Series with None for missing rows
391
+ all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
392
+ all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
393
+ df["pred_proba"] = all_proba
319
394
 
320
395
  # Expand the pred_proba column into separate columns for each class
321
396
  df = expand_proba_column(df, label_encoder.classes_)
397
+ else:
398
+ # No probabilities, use averaged predictions
399
+ predictions = label_encoder.inverse_transform(preds.astype(int))
400
+ else:
401
+ # Regression (includes uq_regressor)
402
+ predictions = preds
403
+ df.loc[~missing_mask, "prediction_std"] = preds_std
404
+
405
+ # Set predictions only for complete rows
406
+ df.loc[~missing_mask, "prediction"] = predictions
322
407
 
323
- # All done, return the DataFrame with new columns for the predictions
324
408
  return df
325
409
 
326
410
 
@@ -328,15 +412,14 @@ if __name__ == "__main__":
328
412
  """The main function is for training the PyTorch Tabular model"""
329
413
 
330
414
  # Harness Template Parameters
331
- target = TEMPLATE_PARAMS["target_column"]
415
+ target = TEMPLATE_PARAMS["target"]
332
416
  features = TEMPLATE_PARAMS["features"]
333
417
  orig_features = features.copy()
418
+ id_column = TEMPLATE_PARAMS["id_column"]
334
419
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
335
420
  model_type = TEMPLATE_PARAMS["model_type"]
336
421
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
337
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
338
422
  hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
339
- validation_split = 0.2
340
423
 
341
424
  # Script arguments for input/output directories
342
425
  parser = argparse.ArgumentParser()
@@ -348,19 +431,27 @@ if __name__ == "__main__":
348
431
  args = parser.parse_args()
349
432
 
350
433
  # Read the training data into DataFrames
351
- training_files = [
352
- os.path.join(args.train, file)
353
- for file in os.listdir(args.train)
354
- if file.endswith(".csv")
355
- ]
434
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
356
435
  print(f"Training Files: {training_files}")
357
436
 
358
437
  # Combine files and read them all into a single pandas dataframe
359
438
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
360
439
 
440
+ # Print out some info about the dataframe
441
+ print(f"All Data Shape: {all_df.shape}")
442
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
443
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
444
+
361
445
  # Check if the dataframe is empty
362
446
  check_dataframe(all_df, "training_df")
363
447
 
448
+ # Drop any rows with missing feature values
449
+ initial_row_count = all_df.shape[0]
450
+ all_df = all_df.dropna(subset=features)
451
+ dropped_rows = initial_row_count - all_df.shape[0]
452
+ if dropped_rows > 0:
453
+ print(f"Dropped {dropped_rows} rows due to missing feature values.")
454
+
364
455
  # Features/Target output
365
456
  print(f"Target: {target}")
366
457
  print(f"Features: {str(features)}")
@@ -368,151 +459,228 @@ if __name__ == "__main__":
368
459
  # Convert any features that might be categorical to 'category' type
369
460
  all_df, category_mappings = convert_categorical_types(all_df, features)
370
461
 
462
+ # Print out some info about the dataframe
463
+ print(f"All Data Shape: {all_df.shape}")
464
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
465
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
466
+
371
467
  # If we have compressed features, decompress them
372
468
  if compressed_features:
373
469
  print(f"Decompressing features {compressed_features}...")
374
470
  all_df, features = decompress_features(all_df, features, compressed_features)
375
471
 
376
- # Do we want to train on all the data?
377
- if train_all_data:
378
- print("Training on ALL of the data")
379
- df_train = all_df.copy()
380
- df_val = all_df.copy()
381
-
382
- # Does the dataframe have a training column?
383
- elif "training" in all_df.columns:
384
- print("Found training column, splitting data based on training column")
385
- df_train = all_df[all_df["training"]]
386
- df_val = all_df[~all_df["training"]]
387
- else:
388
- # Just do a random training Split
389
- print("WARNING: No training column found, splitting data with random state=42")
390
- df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
391
- print(f"FIT/TRAIN: {df_train.shape}")
392
- print(f"VALIDATION: {df_val.shape}")
393
-
394
472
  # Determine categorical and continuous columns
395
- categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
473
+ categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
396
474
  continuous_cols = [col for col in features if col not in categorical_cols]
397
-
398
475
  print(f"Categorical columns: {categorical_cols}")
399
476
  print(f"Continuous columns: {continuous_cols}")
400
477
 
401
- # Set up PyTorch Tabular configuration
402
- data_config = DataConfig(
403
- target=[target],
404
- continuous_cols=continuous_cols,
405
- categorical_cols=categorical_cols,
406
- )
478
+ # Cast continuous columns to float
479
+ all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
407
480
 
408
- # Choose the 'task' based on model type also set up the label encoder if needed
481
+ # Choose the 'task' based on model type and set up the label encoder if needed
409
482
  if model_type == "classifier":
410
483
  task = "classification"
411
- # Encode the target column
484
+ # Encode the target column on full dataset for consistent encoding
412
485
  label_encoder = LabelEncoder()
413
- df_train[target] = label_encoder.fit_transform(df_train[target])
414
- df_val[target] = label_encoder.transform(df_val[target])
486
+ all_df[target] = label_encoder.fit_transform(all_df[target])
487
+ num_classes = len(label_encoder.classes_)
415
488
  else:
416
489
  task = "regression"
417
490
  label_encoder = None
491
+ num_classes = None
418
492
 
419
493
  # Use any hyperparameters to set up both the trainer and model configurations
420
494
  print(f"Hyperparameters: {hyperparameters}")
495
+ n_folds = hyperparameters.get("n_folds", 5) # Number of CV folds (default: 5)
496
+
497
+ # =========================================================================
498
+ # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
499
+ # =========================================================================
500
+ print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
501
+
502
+ # Create fold splits
503
+ if n_folds == 1:
504
+ # Single fold: use train/val split from "training" column or random split
505
+ if "training" in all_df.columns:
506
+ print("Found training column, splitting data based on training column")
507
+ train_idx = np.where(all_df["training"])[0]
508
+ val_idx = np.where(~all_df["training"])[0]
509
+ else:
510
+ print("WARNING: No training column found, splitting data with random 80/20 split")
511
+ indices = np.arange(len(all_df))
512
+ train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
513
+ folds = [(train_idx, val_idx)]
514
+ else:
515
+ # K-Fold CV
516
+ if model_type == "classifier":
517
+ kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
518
+ split_target = all_df[target]
519
+ else:
520
+ kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
521
+ split_target = None
522
+ folds = list(kfold.split(all_df, split_target))
523
+
524
+ # Initialize storage for out-of-fold predictions
525
+ oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
526
+ if model_type == "classifier" and num_classes and num_classes > 1:
527
+ oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
528
+ else:
529
+ oof_proba = None
421
530
 
422
- # Set up PyTorch Tabular configuration with defaults
423
- trainer_defaults = {
424
- "auto_lr_find": True,
425
- "batch_size": min(1024, max(32, len(df_train) // 4)),
426
- "max_epochs": 100,
427
- "early_stopping": "valid_loss",
428
- "early_stopping_patience": 15,
429
- "checkpoints": "valid_loss",
430
- "accelerator": "auto",
431
- "progress_bar": "none",
432
- "gradient_clip_val": 1.0,
433
- }
531
+ ensemble_models = []
434
532
 
435
- # Override defaults with training_config if present
436
- training_overrides = {k: v for k, v in hyperparameters.get('training_config', {}).items()
437
- if k in trainer_defaults}
438
- # Print overwrites
439
- for key, value in training_overrides.items():
440
- print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
441
- trainer_params = {**trainer_defaults, **training_overrides}
442
- trainer_config = TrainerConfig(**trainer_params)
533
+ # Set up PyTorch Tabular data configuration (shared across folds)
534
+ data_config = DataConfig(
535
+ target=[target],
536
+ continuous_cols=continuous_cols,
537
+ categorical_cols=categorical_cols,
538
+ )
443
539
 
444
540
  # Model config defaults
445
541
  model_defaults = {
446
- "layers": "1024-512-512",
447
- "activation": "ReLU",
542
+ "layers": "256-128-64",
543
+ "activation": "LeakyReLU",
448
544
  "learning_rate": 1e-3,
449
545
  "dropout": 0.1,
450
546
  "use_batch_norm": True,
451
547
  "initialization": "kaiming",
452
548
  }
453
549
  # Override defaults with model_config if present
454
- model_overrides = {k: v for k, v in hyperparameters.get('model_config', {}).items()
455
- if k in model_defaults}
456
- # Print overwrites
550
+ model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
457
551
  for key, value in model_overrides.items():
458
552
  print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
459
553
  model_params = {**model_defaults, **model_overrides}
460
554
 
461
- # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
462
- # Works effectively for both regression and classification as the foundational
463
- # architecture in PyTorch Tabular
464
- model_config = CategoryEmbeddingModelConfig(
465
- task=task,
466
- **model_params
467
- )
555
+ model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
468
556
  optimizer_config = OptimizerConfig()
469
557
 
470
- #####################################
471
- # Create and train the TabularModel #
472
- #####################################
473
- tabular_model = TabularModel(
474
- data_config=data_config,
475
- model_config=model_config,
476
- optimizer_config=optimizer_config,
477
- trainer_config=trainer_config,
478
- )
479
- tabular_model.fit(train=df_train, validation=df_val)
558
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
559
+ print(f"\n{'='*50}")
560
+ print(f"Training Fold {fold_idx + 1}/{len(folds)}")
561
+ print(f"{'='*50}")
562
+
563
+ # Split data for this fold
564
+ df_train = all_df.iloc[train_idx].reset_index(drop=True)
565
+ df_val = all_df.iloc[val_idx].reset_index(drop=True)
566
+
567
+ print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
568
+
569
+ # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
570
+ # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
571
+ batch_size = min(128, max(32, len(df_train) // 16))
572
+ if len(df_train) % batch_size == 1:
573
+ batch_size += 1 # Adjust to avoid last batch of size 1
574
+ trainer_defaults = {
575
+ "auto_lr_find": False,
576
+ "batch_size": batch_size,
577
+ "max_epochs": 200,
578
+ "min_epochs": 10,
579
+ "early_stopping": "valid_loss",
580
+ "early_stopping_patience": 20,
581
+ "checkpoints": "valid_loss",
582
+ "accelerator": "auto",
583
+ "progress_bar": "none",
584
+ "gradient_clip_val": 1.0,
585
+ "seed": 42 + fold_idx,
586
+ }
587
+
588
+ # Override defaults with training_config if present
589
+ training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
590
+ if fold_idx == 0: # Only print overrides once
591
+ for key, value in training_overrides.items():
592
+ print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
593
+ trainer_params = {**trainer_defaults, **training_overrides}
594
+ trainer_config = TrainerConfig(**trainer_params)
595
+
596
+ # Create and train the TabularModel for this fold
597
+ tabular_model = TabularModel(
598
+ data_config=data_config,
599
+ model_config=model_config,
600
+ optimizer_config=optimizer_config,
601
+ trainer_config=trainer_config,
602
+ )
603
+ tabular_model.fit(train=df_train, validation=df_val)
604
+ ensemble_models.append(tabular_model)
605
+
606
+ # Make out-of-fold predictions
607
+ result = tabular_model.predict(df_val, include_input_features=False)
608
+ fold_preds = result[f"{target}_prediction"].values
609
+
610
+ # Store out-of-fold predictions
611
+ if model_type == "classifier":
612
+ oof_predictions[val_idx] = fold_preds.astype(int)
613
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
614
+ if prob_cols and oof_proba is not None:
615
+ oof_proba[val_idx] = result[prob_cols].values
616
+ else:
617
+ oof_predictions[val_idx] = fold_preds.flatten()
480
618
 
481
- # Make Predictions on the Validation Set
482
- print("Making Predictions on Validation Set...")
483
- result = tabular_model.predict(df_val, include_input_features=False)
619
+ print(f"Fold {fold_idx + 1} complete!")
484
620
 
485
- # pytorch-tabular returns predictions using f"{target}_prediction" column
486
- # and classification probabilities in columns ending with "_probability"
487
- if model_type == "classifier":
488
- preds = result[f"{target}_prediction"].values
621
+ print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
622
+
623
+ # Use out-of-fold predictions for metrics
624
+ # For n_folds=1, we only have predictions for val_idx, so filter to those rows
625
+ if n_folds == 1:
626
+ val_mask = ~np.isnan(oof_predictions)
627
+ preds = oof_predictions[val_mask]
628
+ df_val = all_df[val_mask].copy()
629
+ if oof_proba is not None:
630
+ oof_proba = oof_proba[val_mask]
489
631
  else:
490
- # Regression: use the target column name
491
- preds = result[f"{target}_prediction"].values
632
+ preds = oof_predictions
633
+ df_val = all_df.copy()
634
+
635
+ # Compute prediction_std by running all ensemble models on validation data
636
+ # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
637
+ preds_std = None
638
+ if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
639
+ print("Computing prediction_std from ensemble predictions on validation data...")
640
+ all_ensemble_preds_for_std = []
641
+ for ens_model in ensemble_models:
642
+ result = ens_model.predict(df_val[features], include_input_features=False)
643
+ ens_preds = result[f"{target}_prediction"].values.flatten()
644
+ all_ensemble_preds_for_std.append(ens_preds)
645
+
646
+ ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
647
+ preds_std = np.std(ensemble_preds_stacked, axis=0)
648
+ print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
492
649
 
493
650
  if model_type == "classifier":
494
651
  # Get probabilities for classification
495
- print("Processing Probabilities...")
496
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
497
- if prob_cols:
498
- probs = result[prob_cols].values
499
- df_val["pred_proba"] = [p.tolist() for p in probs]
500
-
501
- # Expand the pred_proba column into separate columns for each class
502
- print(df_val.columns)
652
+ if oof_proba is not None:
653
+ df_val = df_val.copy()
654
+ df_val["pred_proba"] = [p.tolist() for p in oof_proba]
503
655
  df_val = expand_proba_column(df_val, label_encoder.classes_)
504
- print(df_val.columns)
505
656
 
506
657
  # Decode the target and prediction labels
507
658
  y_validate = label_encoder.inverse_transform(df_val[target])
508
- preds = label_encoder.inverse_transform(preds.astype(int))
659
+ preds_decoded = label_encoder.inverse_transform(preds.astype(int))
509
660
  else:
510
661
  y_validate = df_val[target].values
662
+ preds_decoded = preds
663
+
664
+ # Save predictions to S3
665
+ df_val = df_val.copy()
666
+ df_val["prediction"] = preds_decoded
667
+
668
+ # Build output columns - include id_column if it exists
669
+ output_columns = []
670
+ if id_column in df_val.columns:
671
+ output_columns.append(id_column)
672
+ output_columns += [target, "prediction"]
673
+
674
+ # Add prediction_std for regression models (always present, 0 for single model)
675
+ if model_type in ["regressor", "uq_regressor"]:
676
+ if preds_std is not None:
677
+ df_val["prediction_std"] = preds_std
678
+ else:
679
+ df_val["prediction_std"] = 0.0
680
+ output_columns.append("prediction_std")
681
+ print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
511
682
 
512
- # Save predictions to S3 (just the target, prediction, and '_probability' columns)
513
- df_val["prediction"] = preds
514
- output_columns = [target, "prediction"]
515
- output_columns += [col for col in df_val.columns if col.endswith("_probability")]
683
+ output_columns += [col for col in df_val.columns if col.endswith("_proba")]
516
684
  wr.s3.to_csv(
517
685
  df_val[output_columns],
518
686
  path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -525,7 +693,7 @@ if __name__ == "__main__":
525
693
  label_names = label_encoder.classes_
526
694
 
527
695
  # Calculate various model performance metrics
528
- scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
696
+ scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
529
697
 
530
698
  # Put the scores into a dataframe
531
699
  score_df = pd.DataFrame(
@@ -533,20 +701,20 @@ if __name__ == "__main__":
533
701
  target: label_names,
534
702
  "precision": scores[0],
535
703
  "recall": scores[1],
536
- "fscore": scores[2],
704
+ "f1": scores[2],
537
705
  "support": scores[3],
538
706
  }
539
707
  )
540
708
 
541
- # We need to get creative with the Classification Metrics
542
- metrics = ["precision", "recall", "fscore", "support"]
709
+ # Output metrics per class
710
+ metrics = ["precision", "recall", "f1", "support"]
543
711
  for t in label_names:
544
712
  for m in metrics:
545
713
  value = score_df.loc[score_df[target] == t, m].iloc[0]
546
714
  print(f"Metrics:{t}:{m} {value}")
547
715
 
548
716
  # Compute and output the confusion matrix
549
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
717
+ conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
550
718
  for i, row_name in enumerate(label_names):
551
719
  for j, col_name in enumerate(label_names):
552
720
  value = conf_mtx[i, j]
@@ -554,22 +722,37 @@ if __name__ == "__main__":
554
722
 
555
723
  else:
556
724
  # Calculate various model performance metrics (regression)
557
- rmse = root_mean_squared_error(y_validate, preds)
558
- mae = mean_absolute_error(y_validate, preds)
559
- r2 = r2_score(y_validate, preds)
560
- print(f"RMSE: {rmse:.3f}")
561
- print(f"MAE: {mae:.3f}")
562
- print(f"R2: {r2:.3f}")
563
- print(f"NumRows: {len(df_val)}")
564
-
565
- # Save the model to the standard place/name
566
- tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
725
+ rmse = root_mean_squared_error(y_validate, preds_decoded)
726
+ mae = mean_absolute_error(y_validate, preds_decoded)
727
+ medae = median_absolute_error(y_validate, preds_decoded)
728
+ r2 = r2_score(y_validate, preds_decoded)
729
+ spearman_corr = spearmanr(y_validate, preds_decoded).correlation
730
+ support = len(df_val)
731
+ print(f"rmse: {rmse:.3f}")
732
+ print(f"mae: {mae:.3f}")
733
+ print(f"medae: {medae:.3f}")
734
+ print(f"r2: {r2:.3f}")
735
+ print(f"spearmanr: {spearman_corr:.3f}")
736
+ print(f"support: {support}")
737
+
738
+ # Save ensemble models
739
+ for model_idx, ens_model in enumerate(ensemble_models):
740
+ model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
741
+ ens_model.save_model(model_path)
742
+ print(f"Saved model {model_idx + 1} to {model_path}")
743
+
744
+ # Save ensemble metadata
745
+ n_ensemble = len(ensemble_models)
746
+ ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
747
+ joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
748
+ print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
749
+
567
750
  if label_encoder:
568
751
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
569
752
 
570
753
  # Save the features (this will validate input during predictions)
571
754
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
572
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
755
+ json.dump(orig_features, fp)
573
756
 
574
757
  # Save the category mappings
575
758
  with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp: