workbench 0.8.160__py3-none-any.whl → 0.8.202__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (114)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +261 -235
  4. workbench/algorithms/graph/light/proximity_graph.py +10 -8
  5. workbench/api/__init__.py +2 -1
  6. workbench/api/compound.py +1 -1
  7. workbench/api/endpoint.py +11 -0
  8. workbench/api/feature_set.py +12 -8
  9. workbench/api/meta.py +5 -2
  10. workbench/api/model.py +16 -15
  11. workbench/api/monitor.py +1 -16
  12. workbench/api/parameter_store.py +5 -0
  13. workbench/core/artifacts/__init__.py +11 -2
  14. workbench/core/artifacts/artifact.py +11 -3
  15. workbench/core/artifacts/data_capture_core.py +355 -0
  16. workbench/core/artifacts/endpoint_core.py +256 -118
  17. workbench/core/artifacts/feature_set_core.py +265 -16
  18. workbench/core/artifacts/model_core.py +110 -63
  19. workbench/core/artifacts/monitor_core.py +33 -248
  20. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  21. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  22. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  23. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  24. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  25. workbench/core/transforms/features_to_model/features_to_model.py +45 -33
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  28. workbench/core/views/training_view.py +113 -42
  29. workbench/core/views/view.py +53 -3
  30. workbench/core/views/view_utils.py +4 -4
  31. workbench/model_scripts/chemprop/chemprop.template +852 -0
  32. workbench/model_scripts/chemprop/generated_model_script.py +852 -0
  33. workbench/model_scripts/chemprop/requirements.txt +11 -0
  34. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  35. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  36. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  37. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  38. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  39. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  40. workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
  41. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  42. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  43. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  44. workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
  45. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  46. workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
  47. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  48. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  49. workbench/model_scripts/pytorch_model/generated_model_script.py +390 -188
  50. workbench/model_scripts/pytorch_model/pytorch.template +387 -176
  51. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  52. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  53. workbench/model_scripts/script_generation.py +19 -10
  54. workbench/model_scripts/uq_models/generated_model_script.py +605 -0
  55. workbench/model_scripts/uq_models/mapie.template +605 -0
  56. workbench/model_scripts/uq_models/requirements.txt +1 -0
  57. workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
  58. workbench/model_scripts/xgb_model/xgb_model.template +44 -46
  59. workbench/repl/workbench_shell.py +28 -14
  60. workbench/scripts/endpoint_test.py +162 -0
  61. workbench/scripts/lambda_test.py +73 -0
  62. workbench/scripts/ml_pipeline_batch.py +137 -0
  63. workbench/scripts/ml_pipeline_sqs.py +186 -0
  64. workbench/scripts/monitor_cloud_watch.py +20 -100
  65. workbench/utils/aws_utils.py +4 -3
  66. workbench/utils/chem_utils/__init__.py +0 -0
  67. workbench/utils/chem_utils/fingerprints.py +134 -0
  68. workbench/utils/chem_utils/misc.py +194 -0
  69. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  70. workbench/utils/chem_utils/mol_standardize.py +450 -0
  71. workbench/utils/chem_utils/mol_tagging.py +348 -0
  72. workbench/utils/chem_utils/projections.py +209 -0
  73. workbench/utils/chem_utils/salts.py +256 -0
  74. workbench/utils/chem_utils/sdf.py +292 -0
  75. workbench/utils/chem_utils/toxicity.py +250 -0
  76. workbench/utils/chem_utils/vis.py +253 -0
  77. workbench/utils/chemprop_utils.py +760 -0
  78. workbench/utils/cloudwatch_handler.py +1 -1
  79. workbench/utils/cloudwatch_utils.py +137 -0
  80. workbench/utils/config_manager.py +3 -7
  81. workbench/utils/endpoint_utils.py +5 -7
  82. workbench/utils/license_manager.py +2 -6
  83. workbench/utils/model_utils.py +95 -34
  84. workbench/utils/monitor_utils.py +44 -62
  85. workbench/utils/pandas_utils.py +3 -3
  86. workbench/utils/pytorch_utils.py +526 -0
  87. workbench/utils/shap_utils.py +10 -2
  88. workbench/utils/workbench_logging.py +0 -3
  89. workbench/utils/workbench_sqs.py +1 -1
  90. workbench/utils/xgboost_model_utils.py +371 -156
  91. workbench/web_interface/components/model_plot.py +7 -1
  92. workbench/web_interface/components/plugin_unit_test.py +5 -2
  93. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  94. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  95. workbench/web_interface/components/plugins/model_details.py +9 -7
  96. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  97. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
  98. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/RECORD +102 -86
  99. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
  100. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
  101. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  102. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  103. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  104. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  105. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  106. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  107. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  108. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  109. workbench/utils/chem_utils.py +0 -1556
  110. workbench/utils/execution_environment.py +0 -211
  111. workbench/utils/fast_inference.py +0 -167
  112. workbench/utils/resource_utils.py +0 -39
  113. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
  114. {workbench-0.8.160.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
@@ -8,43 +8,43 @@ import numpy as np
8
8
  os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
9
9
  from pytorch_tabular import TabularModel
10
10
  from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
11
- from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig
11
+ from pytorch_tabular.models import CategoryEmbeddingModelConfig
12
12
 
13
13
  # Model Performance Scores
14
14
  from sklearn.metrics import (
15
15
  mean_absolute_error,
16
+ median_absolute_error,
16
17
  r2_score,
17
18
  root_mean_squared_error,
18
19
  precision_recall_fscore_support,
19
20
  confusion_matrix,
20
21
  )
22
+ from scipy.stats import spearmanr
21
23
 
22
24
  # Classification Encoder
23
25
  from sklearn.preprocessing import LabelEncoder
24
26
 
25
27
  # Scikit Learn Imports
26
- from sklearn.model_selection import train_test_split
28
+ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
27
29
 
28
30
  from io import StringIO
29
31
  import json
30
32
  import argparse
31
33
  import joblib
32
- import os
33
34
  import pandas as pd
34
- from typing import List, Tuple
35
35
 
36
36
  # Template Parameters
37
37
  TEMPLATE_PARAMS = {
38
- "model_type": "classifier",
39
- "target_column": "solubility_class",
40
- "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
38
+ "model_type": "uq_regressor",
39
+ "target": "udm_asy_res_efflux_ratio",
40
+ "features": ['smr_vsa4', 'tpsa', 'nhohcount', 'peoe_vsa1', 'mollogp', 'numhdonors', 'tertiary_amine_count', 'smr_vsa3', 'nitrogen_span', 'vsa_estate2', 'hba_hbd_ratio', 'minpartialcharge', 'estate_vsa4', 'asphericity', 'charge_centroid_distance', 'peoe_vsa8', 'mi', 'estate_vsa8', 'vsa_estate6', 'vsa_estate3', 'molecular_volume_3d', 'kappa3', 'smr_vsa5', 'sv', 'xp_6dv', 'xc_4dv', 'si', 'molecular_axis_length', 'axp_5d', 'estate_vsa3', 'estate_vsa10', 'axp_7dv', 'slogp_vsa1', 'molecular_asymmetry', 'molmr', 'qed', 'xp_3d', 'axp_0dv', 'fpdensitymorgan1', 'minabsestateindex', 'numatomstereocenters', 'fpdensitymorgan2', 'slogp_vsa2', 'xch_5dv', 'num_s_centers', 'aromatic_interaction_score', 'axp_2dv', 'chi1v', 'hallkieralpha', 'vsa_estate8', 'peoe_vsa9', 'type_ii_pattern_count', 'slogp_vsa5', 'xc_3d', 'amphiphilic_moment', 'bcut2d_logphi', 'estate_vsa6', 'xc_3dv', 'chi0n', 'vsa_estate5', 'xpc_6d', 'vsa_estate7', 'axp_1d', 'axp_7d', 'xch_4dv', 'phi', 'maxestateindex', 'sps', 'bcut2d_mrlow', 'vsa_estate4', 'avgipc', 'bcut2d_mrhi', 'bcut2d_logplow', 'axp_1dv', 'kappa1', 'vsa_estate9', 'fr_imidazole', 'axp_6d', 'radius_of_gyration', 'chi2v', 'chi4n', 'xp_7d', 'smr_vsa6', 'axp_2d', 'num_r_centers', 'xch_7dv', 'estate_vsa2', 'axp_4d', 'maxpartialcharge', 'xpc_5dv', 'xp_5d', 'chi3n', 'chi2n', 'vsa_estate1', 'slogp_vsa11', 'bcut2d_mwlow', 'mm', 'c3sp3', 'numhacceptors', 'fr_nhpyrrole', 'labuteasa', 'fpdensitymorgan3', 'bcut2d_chghi', 'axp_3dv', 'c1sp3', 'kappa2', 'smr_vsa9', 'xp_6d', 'estate_vsa7', 'axp_6dv', 'sp', 'estate_vsa5', 'peoe_vsa2', 'smr_vsa1', 'mp', 'minestateindex', 'axp_3d', 'axp_4dv', 'chi0v', 'slogp_vsa3', 'heavyatommolwt', 'smr_vsa7', 'peoe_vsa6', 'mv', 'xp_4d', 'peoe_vsa7', 'mpe', 'chi4v', 'maxabspartialcharge', 'bcut2d_chglo', 'c1sp2', 'xp_0dv', 'smr_vsa10', 'estate_vsa1', 'fr_pyridine', 'bcut2d_mwhi', 'spe', 'balabanj', 'xch_7d', 'estate_vsa9', 'xp_3dv', 'fr_piperzine', 'xch_6dv', 'slogp_vsa8', 'peoe_vsa10', 'xp_4dv', 'c3sp2', 'fr_al_oh', 'xc_5d', 
'fractioncsp3', 'fr_bicyclic', 'fr_piperdine', 'peoe_vsa12', 'peoe_vsa11', 'numheteroatoms', 'mse', 'xp_7dv', 'chi1', 'xpc_6dv', 'numsaturatedcarbocycles', 'chi1n', 'bertzct', 'xc_5dv', 'chi3v', 'intramolecular_hbond_potential', 'peoe_vsa4', 'xpc_5d', 'xp_2d', 'nbase', 'fr_priamide', 'slogp_vsa4', 'naromatom', 'vsa_estate10', 'fr_nitrile', 'molwt', 'peoe_vsa13', 'xch_4d', 'xp_5dv', 'numaromaticheterocycles', 'xpc_4dv', 'fr_hoccn', 'nocount', 'fr_nh1', 'mz', 'xc_6dv', 'hybratio', 'fr_imine', 'fr_morpholine', 'xpc_4d', 'xch_5d', 'numvalenceelectrons', 'numheterocycles', 'fr_aniline', 'fr_nh0', 'frac_defined_stereo', 'fr_benzene', 'xp_2dv', 'type_i_pattern_count', 'fr_ketone_topliss', 'fr_aryl_methyl', 'heavyatomcount', 'mare', 'axp_5dv', 'exactmolwt', 'xch_6d', 'xp_1d', 'xch_3d', 'axp_0d', 'amide_count', 'sse', 'slogp_vsa7', 'c2sp2', 'numrotatablebonds', 'chi0', 'xc_4d', 'slogp_vsa10', 'fr_al_oh_notert', 'numspiroatoms', 'numsaturatedrings', 'minabspartialcharge', 'fr_sulfone', 'slogp_vsa6', 'smr_vsa2', 'num_defined_stereocenters', 'numbridgeheadatoms', 'peoe_vsa3', 'numaliphaticheterocycles', 'fr_ndealkylation1', 'xc_6d'],
41
+ "id_column": "udm_mol_bat_id",
41
42
  "compressed_features": [],
42
- "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-class/training",
43
- "train_all_data": False
43
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
44
+ "hyperparameters": {'n_folds': 5, 'training_config': {'max_epochs': 200, 'early_stopping_patience': 20}, 'model_config': {'layers': '256-128-64', 'dropout': 0.1, 'learning_rate': 0.001, 'activation': 'LeakyReLU'}},
44
45
  }
45
46
 
46
47
 
47
- # Function to check if dataframe is empty
48
48
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
49
49
  """
50
50
  Check if the provided dataframe is empty and raise an exception if it is.
@@ -59,19 +59,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
59
59
  raise ValueError(msg)
60
60
 
61
61
 
62
- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
62
+ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
63
63
  """
64
64
  Expands a column in a DataFrame containing a list of probabilities into separate columns.
65
65
 
66
66
  Args:
67
67
  df (pd.DataFrame): DataFrame containing a "pred_proba" column
68
- class_labels (List[str]): List of class labels
68
+ class_labels (list[str]): List of class labels
69
69
 
70
70
  Returns:
71
71
  pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
72
72
  """
73
-
74
- # Sanity check
75
73
  proba_column = "pred_proba"
76
74
  if proba_column not in df.columns:
77
75
  raise ValueError('DataFrame does not contain a "pred_proba" column')
@@ -88,11 +86,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
88
86
 
89
87
  # Concatenate the new columns with the original DataFrame
90
88
  df = pd.concat([df, proba_df], axis=1)
91
- print(df)
92
89
  return df
93
90
 
94
91
 
95
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
92
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
96
93
  """
97
94
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
98
95
  Prioritizes exact matches, then case-insensitive matches.
@@ -102,7 +99,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
102
99
  df_columns_lower = {col.lower(): col for col in df.columns}
103
100
  rename_dict = {}
104
101
  missing = []
105
-
106
102
  for feature in model_features:
107
103
  if feature in df.columns:
108
104
  continue # Exact match
@@ -114,58 +110,64 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
114
110
  if missing:
115
111
  raise ValueError(f"Features not found: {missing}")
116
112
 
113
+ # Rename the DataFrame columns to match the model features
117
114
  return df.rename(columns=rename_dict)
118
115
 
119
116
 
120
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
117
+ def convert_categorical_types(
118
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
119
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
121
120
  """
122
121
  Converts appropriate columns to categorical type with consistent mappings.
123
122
 
124
123
  Args:
125
124
  df (pd.DataFrame): The DataFrame to process.
126
125
  features (list): List of feature names to consider for conversion.
127
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
128
- training mode. If populated, we're in inference mode.
126
+ category_mappings (dict, optional): Existing category mappings. If None or empty,
127
+ we're in training mode. If populated, we're in
128
+ inference mode.
129
129
 
130
130
  Returns:
131
131
  tuple: (processed DataFrame, category mappings dictionary)
132
132
  """
133
+ if category_mappings is None:
134
+ category_mappings = {}
135
+
133
136
  # Training mode
134
- if category_mappings == {}:
137
+ if not category_mappings:
135
138
  for col in df.select_dtypes(include=["object", "string"]):
136
139
  if col in features and df[col].nunique() < 20:
137
140
  print(f"Training mode: Converting {col} to category")
138
141
  df[col] = df[col].astype("category")
139
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
142
+ category_mappings[col] = df[col].cat.categories.tolist()
140
143
 
141
144
  # Inference mode
142
145
  else:
143
146
  for col, categories in category_mappings.items():
144
147
  if col in df.columns:
145
148
  print(f"Inference mode: Applying categorical mapping for {col}")
146
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
149
+ df[col] = pd.Categorical(df[col], categories=categories)
147
150
 
148
151
  return df, category_mappings
149
152
 
150
153
 
151
154
  def decompress_features(
152
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
153
- ) -> Tuple[pd.DataFrame, List[str]]:
155
+ df: pd.DataFrame, features: list[str], compressed_features: list[str]
156
+ ) -> tuple[pd.DataFrame, list[str]]:
154
157
  """Prepare features for the model
155
158
 
156
159
  Args:
157
160
  df (pd.DataFrame): The features DataFrame
158
- features (List[str]): Full list of feature names
159
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
161
+ features (list[str]): Full list of feature names
162
+ compressed_features (list[str]): List of feature names to decompress (bitstrings)
160
163
 
161
164
  Returns:
162
165
  pd.DataFrame: DataFrame with the decompressed features
163
- List[str]: Updated list of feature names after decompression
166
+ list[str]: Updated list of feature names after decompression
164
167
 
165
168
  Raises:
166
169
  ValueError: If any missing values are found in the specified features
167
170
  """
168
-
169
171
  # Check for any missing values in the required features
170
172
  missing_counts = df[features].isna().sum()
171
173
  if missing_counts.any():
@@ -175,10 +177,11 @@ def decompress_features(
175
177
  "WARNING: You might want to remove/replace all NaN values before processing."
176
178
  )
177
179
 
178
- # Decompress the specified compressed features
179
- decompressed_features = features
180
+ # Make a copy to avoid mutating the original list
181
+ decompressed_features = features.copy()
182
+
180
183
  for feature in compressed_features:
181
- if (feature not in df.columns) or (feature not in features):
184
+ if (feature not in df.columns) or (feature not in decompressed_features):
182
185
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
183
186
  continue
184
187
 
@@ -203,35 +206,60 @@ def decompress_features(
203
206
  return df, decompressed_features
204
207
 
205
208
 
206
- def model_fn(model_dir):
207
- """Deserialize and return fitted PyTorch Tabular model"""
208
- #
209
- os.environ['TEMP'] = '/tmp'
210
- model_path = os.path.join(model_dir, "tabular_model")
211
- model = TabularModel.load_model(model_path)
212
- return model
209
+ def model_fn(model_dir: str) -> dict:
210
+ """Load the PyTorch Tabular ensemble models from the specified directory.
213
211
 
212
+ Args:
213
+ model_dir: Directory containing the saved model(s)
214
214
 
215
- def model_fn(model_dir):
215
+ Returns:
216
+ Dictionary with ensemble models and metadata
217
+ """
218
+ import torch
219
+ from functools import partial
220
+
221
+ # Load ensemble metadata if present
222
+ ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
223
+ if os.path.exists(ensemble_metadata_path):
224
+ ensemble_metadata = joblib.load(ensemble_metadata_path)
225
+ n_ensemble = ensemble_metadata["n_ensemble"]
226
+ else:
227
+ n_ensemble = 1
228
+
229
+ # Determine map_location for loading models (handle CUDA trained models on CPU inference)
230
+ map_location = torch.device("cuda" if torch.cuda.is_available() else "cpu")
231
+
232
+ # Patch torch.load globally to use map_location (needed for joblib-loaded callbacks)
233
+ # This handles the case where pytorch-tabular loads callbacks.sav via joblib,
234
+ # which internally calls torch.load without map_location
235
+ original_torch_load = torch.load
236
+ torch.load = partial(original_torch_load, map_location=map_location)
216
237
 
217
238
  # Save current working directory
218
239
  original_cwd = os.getcwd()
240
+ ensemble_models = []
241
+
219
242
  try:
220
243
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
221
- os.chdir('/tmp')
244
+ os.chdir("/tmp")
222
245
 
223
- # Load the model
224
- model_path = os.path.join(model_dir, "tabular_model")
225
- model = TabularModel.load_model(model_path)
246
+ for ens_idx in range(n_ensemble):
247
+ # Try numbered model path first, fall back to legacy path
248
+ model_path = os.path.join(model_dir, f"tabular_model_{ens_idx}")
249
+ if not os.path.exists(model_path):
250
+ model_path = os.path.join(model_dir, "tabular_model")
251
+ model = TabularModel.load_model(model_path, map_location=map_location)
252
+ ensemble_models.append(model)
226
253
 
227
- # Restore the original working directory
228
254
  finally:
255
+ # Restore torch.load and working directory
256
+ torch.load = original_torch_load
229
257
  os.chdir(original_cwd)
230
258
 
231
- return model
259
+ return {"ensemble_models": ensemble_models, "n_ensemble": n_ensemble}
232
260
 
233
261
 
234
- def input_fn(input_data, content_type):
262
+ def input_fn(input_data, content_type: str) -> pd.DataFrame:
235
263
  """Parse input data and return a DataFrame."""
236
264
  if not input_data:
237
265
  raise ValueError("Empty input data is not supported!")
@@ -248,29 +276,34 @@ def input_fn(input_data, content_type):
248
276
  raise ValueError(f"{content_type} not supported!")
249
277
 
250
278
 
251
- def output_fn(output_df, accept_type):
279
+ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
252
280
  """Supports both CSV and JSON output formats."""
253
281
  if "text/csv" in accept_type:
254
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
282
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
255
283
  return csv_output, "text/csv"
256
284
  elif "application/json" in accept_type:
257
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
285
+ return output_df.to_json(orient="records"), "application/json"
258
286
  else:
259
287
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
260
288
 
261
289
 
262
- def predict_fn(df, model) -> pd.DataFrame:
263
- """Make Predictions with our PyTorch Tabular Model
290
+ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
291
+ """Make Predictions with our PyTorch Tabular Model ensemble.
264
292
 
265
293
  Args:
266
294
  df (pd.DataFrame): The input DataFrame
267
- model: The TabularModel use for predictions
295
+ model_dict: Dictionary containing ensemble models and metadata
268
296
 
269
297
  Returns:
270
- pd.DataFrame: The DataFrame with the predictions added
298
+ pd.DataFrame: The DataFrame with predictions (and prediction_std for ensembles)
271
299
  """
300
+ model_type = TEMPLATE_PARAMS["model_type"]
272
301
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
273
302
 
303
+ # Extract ensemble models
304
+ ensemble_models = model_dict["ensemble_models"]
305
+ n_ensemble = model_dict["n_ensemble"]
306
+
274
307
  # Grab our feature columns (from training)
275
308
  model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
276
309
  with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -283,12 +316,11 @@ def predict_fn(df, model) -> pd.DataFrame:
283
316
 
284
317
  # Load our Label Encoder if we have one
285
318
  label_encoder = None
286
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
287
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
319
+ label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
320
+ if os.path.exists(label_encoder_path):
321
+ label_encoder = joblib.load(label_encoder_path)
288
322
 
289
- # We're going match features in a case-insensitive manner, accounting for all the permutations
290
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
291
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
323
+ # Match features in a case-insensitive manner
292
324
  matched_df = match_features_case_insensitive(df, features)
293
325
 
294
326
  # Detect categorical types in the incoming DataFrame
@@ -299,36 +331,80 @@ def predict_fn(df, model) -> pd.DataFrame:
299
331
  print("Decompressing features for prediction...")
300
332
  matched_df, features = decompress_features(matched_df, features, compressed_features)
301
333
 
302
- # Make predictions using the TabularModel
303
- result = model.predict(matched_df[features])
334
+ # Track rows with missing features
335
+ missing_mask = matched_df[features].isna().any(axis=1)
336
+ if missing_mask.any():
337
+ print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
338
+
339
+ # Initialize prediction columns
340
+ df["prediction"] = np.nan
341
+ if model_type in ["regressor", "uq_regressor"]:
342
+ df["prediction_std"] = np.nan
343
+
344
+ # Only predict on complete rows
345
+ complete_df = matched_df[~missing_mask]
346
+ if len(complete_df) == 0:
347
+ print("Warning: No complete rows to predict on")
348
+ return df
304
349
 
305
350
  # pytorch-tabular returns predictions using f"{target}_prediction" column
306
- # and classification probabilities in columns ending with "_probability"
307
- target = TEMPLATE_PARAMS["target_column"]
351
+ target = TEMPLATE_PARAMS["target"]
308
352
  prediction_column = f"{target}_prediction"
309
- if prediction_column in result.columns:
310
- predictions = result[prediction_column].values
311
- else:
312
- raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
313
353
 
314
- # If we have a label encoder, decode the predictions
315
- if label_encoder:
316
- predictions = label_encoder.inverse_transform(predictions.astype(int))
354
+ # Collect predictions from all ensemble members
355
+ all_ensemble_preds = []
356
+ all_ensemble_probs = []
357
+
358
+ for ens_idx, ens_model in enumerate(ensemble_models):
359
+ result = ens_model.predict(complete_df[features])
360
+
361
+ if prediction_column in result.columns:
362
+ ens_preds = result[prediction_column].values
363
+ else:
364
+ raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
317
365
 
318
- # Set the predictions on the DataFrame
319
- df["prediction"] = predictions
366
+ all_ensemble_preds.append(ens_preds)
320
367
 
321
- # For classification, get probabilities
368
+ # For classification, collect probabilities
369
+ if label_encoder is not None:
370
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
371
+ if prob_cols:
372
+ all_ensemble_probs.append(result[prob_cols].values)
373
+
374
+ # Stack and compute mean/std (std is 0 for single model)
375
+ ensemble_preds = np.stack(all_ensemble_preds, axis=0) # (n_ensemble, n_samples)
376
+ preds = np.mean(ensemble_preds, axis=0)
377
+ preds_std = np.std(ensemble_preds, axis=0) # Will be 0s for n_ensemble=1
378
+
379
+ print(f"Inference: Ensemble predictions shape: {preds.shape}, n_ensemble: {n_ensemble}")
380
+
381
+ # Handle classification vs regression
322
382
  if label_encoder is not None:
323
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
324
- if prob_cols:
325
- probs = result[prob_cols].values
326
- df["pred_proba"] = [p.tolist() for p in probs]
383
+ # For classification, average probabilities then take argmax
384
+ if all_ensemble_probs:
385
+ ensemble_probs = np.stack(all_ensemble_probs, axis=0) # (n_ensemble, n_samples, n_classes)
386
+ avg_probs = np.mean(ensemble_probs, axis=0) # (n_samples, n_classes)
387
+ class_preds = np.argmax(avg_probs, axis=1)
388
+ predictions = label_encoder.inverse_transform(class_preds)
389
+
390
+ # Build full proba Series with None for missing rows
391
+ all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
392
+ all_proba.loc[~missing_mask] = [p.tolist() for p in avg_probs]
393
+ df["pred_proba"] = all_proba
327
394
 
328
395
  # Expand the pred_proba column into separate columns for each class
329
396
  df = expand_proba_column(df, label_encoder.classes_)
397
+ else:
398
+ # No probabilities, use averaged predictions
399
+ predictions = label_encoder.inverse_transform(preds.astype(int))
400
+ else:
401
+ # Regression (includes uq_regressor)
402
+ predictions = preds
403
+ df.loc[~missing_mask, "prediction_std"] = preds_std
404
+
405
+ # Set predictions only for complete rows
406
+ df.loc[~missing_mask, "prediction"] = predictions
330
407
 
331
- # All done, return the DataFrame with new columns for the predictions
332
408
  return df
333
409
 
334
410
 
@@ -336,14 +412,14 @@ if __name__ == "__main__":
336
412
  """The main function is for training the PyTorch Tabular model"""
337
413
 
338
414
  # Harness Template Parameters
339
- target = TEMPLATE_PARAMS["target_column"]
415
+ target = TEMPLATE_PARAMS["target"]
340
416
  features = TEMPLATE_PARAMS["features"]
341
417
  orig_features = features.copy()
418
+ id_column = TEMPLATE_PARAMS["id_column"]
342
419
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
343
420
  model_type = TEMPLATE_PARAMS["model_type"]
344
421
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
345
- train_all_data = TEMPLATE_PARAMS["train_all_data"]
346
- validation_split = 0.2
422
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
347
423
 
348
424
  # Script arguments for input/output directories
349
425
  parser = argparse.ArgumentParser()
@@ -355,19 +431,27 @@ if __name__ == "__main__":
355
431
  args = parser.parse_args()
356
432
 
357
433
  # Read the training data into DataFrames
358
- training_files = [
359
- os.path.join(args.train, file)
360
- for file in os.listdir(args.train)
361
- if file.endswith(".csv")
362
- ]
434
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
363
435
  print(f"Training Files: {training_files}")
364
436
 
365
437
  # Combine files and read them all into a single pandas dataframe
366
438
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
367
439
 
440
+ # Print out some info about the dataframe
441
+ print(f"All Data Shape: {all_df.shape}")
442
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
443
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
444
+
368
445
  # Check if the dataframe is empty
369
446
  check_dataframe(all_df, "training_df")
370
447
 
448
+ # Drop any rows with missing feature values
449
+ initial_row_count = all_df.shape[0]
450
+ all_df = all_df.dropna(subset=features)
451
+ dropped_rows = initial_row_count - all_df.shape[0]
452
+ if dropped_rows > 0:
453
+ print(f"Dropped {dropped_rows} rows due to missing feature values.")
454
+
371
455
  # Features/Target output
372
456
  print(f"Target: {target}")
373
457
  print(f"Features: {str(features)}")
@@ -375,125 +459,228 @@ if __name__ == "__main__":
375
459
  # Convert any features that might be categorical to 'category' type
376
460
  all_df, category_mappings = convert_categorical_types(all_df, features)
377
461
 
462
+ # Print out some info about the dataframe
463
+ print(f"All Data Shape: {all_df.shape}")
464
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
465
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
466
+
378
467
  # If we have compressed features, decompress them
379
468
  if compressed_features:
380
469
  print(f"Decompressing features {compressed_features}...")
381
470
  all_df, features = decompress_features(all_df, features, compressed_features)
382
471
 
383
- # Do we want to train on all the data?
384
- if train_all_data:
385
- print("Training on ALL of the data")
386
- df_train = all_df.copy()
387
- df_val = all_df.copy()
388
-
389
- # Does the dataframe have a training column?
390
- elif "training" in all_df.columns:
391
- print("Found training column, splitting data based on training column")
392
- df_train = all_df[all_df["training"]]
393
- df_val = all_df[~all_df["training"]]
394
- else:
395
- # Just do a random training Split
396
- print("WARNING: No training column found, splitting data with random state=42")
397
- df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
398
- print(f"FIT/TRAIN: {df_train.shape}")
399
- print(f"VALIDATION: {df_val.shape}")
400
-
401
472
  # Determine categorical and continuous columns
402
- categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
473
+ categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
403
474
  continuous_cols = [col for col in features if col not in categorical_cols]
404
-
405
475
  print(f"Categorical columns: {categorical_cols}")
406
476
  print(f"Continuous columns: {continuous_cols}")
407
477
 
408
- # Set up PyTorch Tabular configuration
409
- data_config = DataConfig(
410
- target=[target],
411
- continuous_cols=continuous_cols,
412
- categorical_cols=categorical_cols,
413
- )
414
-
415
- trainer_config = TrainerConfig(
416
- auto_lr_find=True,
417
- batch_size=min(1024, len(df_train) // 4),
418
- max_epochs=100,
419
- early_stopping="valid_loss",
420
- early_stopping_patience=15,
421
- checkpoints="valid_loss",
422
- accelerator="auto",
423
- progress_bar="none",
424
- gradient_clip_val=1.0,
425
- )
478
+ # Cast continuous columns to float
479
+ all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
426
480
 
427
- optimizer_config = OptimizerConfig()
428
-
429
- # Choose model configuration based on model type
481
+ # Choose the 'task' based on model type and set up the label encoder if needed
430
482
  if model_type == "classifier":
431
483
  task = "classification"
432
- # Encode the target column
484
+ # Encode the target column on full dataset for consistent encoding
433
485
  label_encoder = LabelEncoder()
434
- df_train[target] = label_encoder.fit_transform(df_train[target])
435
- df_val[target] = label_encoder.transform(df_val[target])
486
+ all_df[target] = label_encoder.fit_transform(all_df[target])
487
+ num_classes = len(label_encoder.classes_)
436
488
  else:
437
489
  task = "regression"
438
490
  label_encoder = None
491
+ num_classes = None
492
+
493
+ # Use any hyperparameters to set up both the trainer and model configurations
494
+ print(f"Hyperparameters: {hyperparameters}")
495
+ n_folds = hyperparameters.get("n_folds", 5) # Number of CV folds (default: 5)
496
+
497
+ # =========================================================================
498
+ # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
499
+ # =========================================================================
500
+ print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
501
+
502
+ # Create fold splits
503
+ if n_folds == 1:
504
+ # Single fold: use train/val split from "training" column or random split
505
+ if "training" in all_df.columns:
506
+ print("Found training column, splitting data based on training column")
507
+ train_idx = np.where(all_df["training"])[0]
508
+ val_idx = np.where(~all_df["training"])[0]
509
+ else:
510
+ print("WARNING: No training column found, splitting data with random 80/20 split")
511
+ indices = np.arange(len(all_df))
512
+ train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
513
+ folds = [(train_idx, val_idx)]
514
+ else:
515
+ # K-Fold CV
516
+ if model_type == "classifier":
517
+ kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
518
+ split_target = all_df[target]
519
+ else:
520
+ kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
521
+ split_target = None
522
+ folds = list(kfold.split(all_df, split_target))
523
+
524
+ # Initialize storage for out-of-fold predictions
525
+ oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
526
+ if model_type == "classifier" and num_classes and num_classes > 1:
527
+ oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
528
+ else:
529
+ oof_proba = None
439
530
 
440
- # Use CategoryEmbedding for both regression and classification tasks
441
- model_config = CategoryEmbeddingModelConfig(
442
- task=task,
443
- layers="1024-512-512",
444
- activation="ReLU",
445
- learning_rate=1e-3,
446
- dropout=0.1,
447
- use_batch_norm=True,
448
- initialization="kaiming",
449
- )
531
+ ensemble_models = []
450
532
 
451
- # Create and train the TabularModel
452
- tabular_model = TabularModel(
453
- data_config=data_config,
454
- model_config=model_config,
455
- optimizer_config=optimizer_config,
456
- trainer_config=trainer_config,
533
+ # Set up PyTorch Tabular data configuration (shared across folds)
534
+ data_config = DataConfig(
535
+ target=[target],
536
+ continuous_cols=continuous_cols,
537
+ categorical_cols=categorical_cols,
457
538
  )
458
539
 
459
- # Train the model
460
- tabular_model.fit(train=df_train, validation=df_val)
540
+ # Model config defaults
541
+ model_defaults = {
542
+ "layers": "256-128-64",
543
+ "activation": "LeakyReLU",
544
+ "learning_rate": 1e-3,
545
+ "dropout": 0.1,
546
+ "use_batch_norm": True,
547
+ "initialization": "kaiming",
548
+ }
549
+ # Override defaults with model_config if present
550
+ model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
551
+ for key, value in model_overrides.items():
552
+ print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
553
+ model_params = {**model_defaults, **model_overrides}
554
+
555
+ model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
556
+ optimizer_config = OptimizerConfig()
461
557
 
462
- # Make Predictions on the Validation Set
463
- print(f"Making Predictions on Validation Set...")
464
- result = tabular_model.predict(df_val, include_input_features=False)
558
+ for fold_idx, (train_idx, val_idx) in enumerate(folds):
559
+ print(f"\n{'='*50}")
560
+ print(f"Training Fold {fold_idx + 1}/{len(folds)}")
561
+ print(f"{'='*50}")
562
+
563
+ # Split data for this fold
564
+ df_train = all_df.iloc[train_idx].reset_index(drop=True)
565
+ df_val = all_df.iloc[val_idx].reset_index(drop=True)
566
+
567
+ print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
568
+
569
+ # Set up PyTorch Tabular trainer configuration (per-fold for batch_size)
570
+ # Calculate batch size that avoids single-sample last batch (batch norm requires >1)
571
+ batch_size = min(128, max(32, len(df_train) // 16))
572
+ if len(df_train) % batch_size == 1:
573
+ batch_size += 1 # Adjust to avoid last batch of size 1
574
+ trainer_defaults = {
575
+ "auto_lr_find": False,
576
+ "batch_size": batch_size,
577
+ "max_epochs": 200,
578
+ "min_epochs": 10,
579
+ "early_stopping": "valid_loss",
580
+ "early_stopping_patience": 20,
581
+ "checkpoints": "valid_loss",
582
+ "accelerator": "auto",
583
+ "progress_bar": "none",
584
+ "gradient_clip_val": 1.0,
585
+ "seed": 42 + fold_idx,
586
+ }
587
+
588
+ # Override defaults with training_config if present
589
+ training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
590
+ if fold_idx == 0: # Only print overrides once
591
+ for key, value in training_overrides.items():
592
+ print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
593
+ trainer_params = {**trainer_defaults, **training_overrides}
594
+ trainer_config = TrainerConfig(**trainer_params)
595
+
596
+ # Create and train the TabularModel for this fold
597
+ tabular_model = TabularModel(
598
+ data_config=data_config,
599
+ model_config=model_config,
600
+ optimizer_config=optimizer_config,
601
+ trainer_config=trainer_config,
602
+ )
603
+ tabular_model.fit(train=df_train, validation=df_val)
604
+ ensemble_models.append(tabular_model)
605
+
606
+ # Make out-of-fold predictions
607
+ result = tabular_model.predict(df_val, include_input_features=False)
608
+ fold_preds = result[f"{target}_prediction"].values
609
+
610
+ # Store out-of-fold predictions
611
+ if model_type == "classifier":
612
+ oof_predictions[val_idx] = fold_preds.astype(int)
613
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
614
+ if prob_cols and oof_proba is not None:
615
+ oof_proba[val_idx] = result[prob_cols].values
616
+ else:
617
+ oof_predictions[val_idx] = fold_preds.flatten()
465
618
 
466
- # pytorch-tabular returns predictions using f"{target}_prediction" column
467
- # and classification probabilities in columns ending with "_probability"
468
- if model_type == "classifier":
469
- preds = result[f"{target}_prediction"].values
619
+ print(f"Fold {fold_idx + 1} complete!")
620
+
621
+ print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
622
+
623
+ # Use out-of-fold predictions for metrics
624
+ # For n_folds=1, we only have predictions for val_idx, so filter to those rows
625
+ if n_folds == 1:
626
+ val_mask = ~np.isnan(oof_predictions)
627
+ preds = oof_predictions[val_mask]
628
+ df_val = all_df[val_mask].copy()
629
+ if oof_proba is not None:
630
+ oof_proba = oof_proba[val_mask]
470
631
  else:
471
- # Regression: use the target column name
472
- preds = result[f"{target}_prediction"].values
632
+ preds = oof_predictions
633
+ df_val = all_df.copy()
634
+
635
+ # Compute prediction_std by running all ensemble models on validation data
636
+ # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
637
+ preds_std = None
638
+ if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
639
+ print("Computing prediction_std from ensemble predictions on validation data...")
640
+ all_ensemble_preds_for_std = []
641
+ for ens_model in ensemble_models:
642
+ result = ens_model.predict(df_val[features], include_input_features=False)
643
+ ens_preds = result[f"{target}_prediction"].values.flatten()
644
+ all_ensemble_preds_for_std.append(ens_preds)
645
+
646
+ ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
647
+ preds_std = np.std(ensemble_preds_stacked, axis=0)
648
+ print(f"Ensemble prediction_std - mean: {np.mean(preds_std):.4f}, max: {np.max(preds_std):.4f}")
473
649
 
474
650
  if model_type == "classifier":
475
651
  # Get probabilities for classification
476
- print("Processing Probabilities...")
477
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
478
- if prob_cols:
479
- probs = result[prob_cols].values
480
- df_val["pred_proba"] = [p.tolist() for p in probs]
481
-
482
- # Expand the pred_proba column into separate columns for each class
483
- print(df_val.columns)
652
+ if oof_proba is not None:
653
+ df_val = df_val.copy()
654
+ df_val["pred_proba"] = [p.tolist() for p in oof_proba]
484
655
  df_val = expand_proba_column(df_val, label_encoder.classes_)
485
- print(df_val.columns)
486
656
 
487
657
  # Decode the target and prediction labels
488
658
  y_validate = label_encoder.inverse_transform(df_val[target])
489
- preds = label_encoder.inverse_transform(preds.astype(int))
659
+ preds_decoded = label_encoder.inverse_transform(preds.astype(int))
490
660
  else:
491
661
  y_validate = df_val[target].values
662
+ preds_decoded = preds
663
+
664
+ # Save predictions to S3
665
+ df_val = df_val.copy()
666
+ df_val["prediction"] = preds_decoded
667
+
668
+ # Build output columns - include id_column if it exists
669
+ output_columns = []
670
+ if id_column in df_val.columns:
671
+ output_columns.append(id_column)
672
+ output_columns += [target, "prediction"]
673
+
674
+ # Add prediction_std for regression models (always present, 0 for single model)
675
+ if model_type in ["regressor", "uq_regressor"]:
676
+ if preds_std is not None:
677
+ df_val["prediction_std"] = preds_std
678
+ else:
679
+ df_val["prediction_std"] = 0.0
680
+ output_columns.append("prediction_std")
681
+ print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
492
682
 
493
- # Save predictions to S3 (just the target, prediction, and '_probability' columns)
494
- df_val["prediction"] = preds
495
- output_columns = [target, "prediction"]
496
- output_columns += [col for col in df_val.columns if col.endswith("_probability")]
683
+ output_columns += [col for col in df_val.columns if col.endswith("_proba")]
497
684
  wr.s3.to_csv(
498
685
  df_val[output_columns],
499
686
  path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -506,7 +693,7 @@ if __name__ == "__main__":
506
693
  label_names = label_encoder.classes_
507
694
 
508
695
  # Calculate various model performance metrics
509
- scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
696
+ scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
510
697
 
511
698
  # Put the scores into a dataframe
512
699
  score_df = pd.DataFrame(
@@ -514,20 +701,20 @@ if __name__ == "__main__":
514
701
  target: label_names,
515
702
  "precision": scores[0],
516
703
  "recall": scores[1],
517
- "fscore": scores[2],
704
+ "f1": scores[2],
518
705
  "support": scores[3],
519
706
  }
520
707
  )
521
708
 
522
- # We need to get creative with the Classification Metrics
523
- metrics = ["precision", "recall", "fscore", "support"]
709
+ # Output metrics per class
710
+ metrics = ["precision", "recall", "f1", "support"]
524
711
  for t in label_names:
525
712
  for m in metrics:
526
713
  value = score_df.loc[score_df[target] == t, m].iloc[0]
527
714
  print(f"Metrics:{t}:{m} {value}")
528
715
 
529
716
  # Compute and output the confusion matrix
530
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
717
+ conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
531
718
  for i, row_name in enumerate(label_names):
532
719
  for j, col_name in enumerate(label_names):
533
720
  value = conf_mtx[i, j]
@@ -535,23 +722,38 @@ if __name__ == "__main__":
535
722
 
536
723
  else:
537
724
  # Calculate various model performance metrics (regression)
538
- rmse = root_mean_squared_error(y_validate, preds)
539
- mae = mean_absolute_error(y_validate, preds)
540
- r2 = r2_score(y_validate, preds)
541
- print(f"RMSE: {rmse:.3f}")
542
- print(f"MAE: {mae:.3f}")
543
- print(f"R2: {r2:.3f}")
544
- print(f"NumRows: {len(df_val)}")
545
-
546
- # Save the model to the standard place/name
547
- tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
725
+ rmse = root_mean_squared_error(y_validate, preds_decoded)
726
+ mae = mean_absolute_error(y_validate, preds_decoded)
727
+ medae = median_absolute_error(y_validate, preds_decoded)
728
+ r2 = r2_score(y_validate, preds_decoded)
729
+ spearman_corr = spearmanr(y_validate, preds_decoded).correlation
730
+ support = len(df_val)
731
+ print(f"rmse: {rmse:.3f}")
732
+ print(f"mae: {mae:.3f}")
733
+ print(f"medae: {medae:.3f}")
734
+ print(f"r2: {r2:.3f}")
735
+ print(f"spearmanr: {spearman_corr:.3f}")
736
+ print(f"support: {support}")
737
+
738
+ # Save ensemble models
739
+ for model_idx, ens_model in enumerate(ensemble_models):
740
+ model_path = os.path.join(args.model_dir, f"tabular_model_{model_idx}")
741
+ ens_model.save_model(model_path)
742
+ print(f"Saved model {model_idx + 1} to {model_path}")
743
+
744
+ # Save ensemble metadata
745
+ n_ensemble = len(ensemble_models)
746
+ ensemble_metadata = {"n_ensemble": n_ensemble, "n_folds": n_folds}
747
+ joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
748
+ print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
749
+
548
750
  if label_encoder:
549
751
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
550
752
 
551
753
  # Save the features (this will validate input during predictions)
552
754
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
553
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
755
+ json.dump(orig_features, fp)
554
756
 
555
757
  # Save the category mappings
556
758
  with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
557
- json.dump(category_mappings, fp)
759
+ json.dump(category_mappings, fp)