workbench 0.8.198__py3-none-any.whl → 0.8.201__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. workbench/algorithms/dataframe/proximity.py +11 -4
  2. workbench/api/__init__.py +2 -1
  3. workbench/api/feature_set.py +7 -4
  4. workbench/api/model.py +1 -1
  5. workbench/core/artifacts/__init__.py +11 -2
  6. workbench/core/artifacts/endpoint_core.py +84 -46
  7. workbench/core/artifacts/feature_set_core.py +69 -1
  8. workbench/core/artifacts/model_core.py +37 -7
  9. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  10. workbench/core/transforms/features_to_model/features_to_model.py +23 -20
  11. workbench/core/views/view.py +2 -2
  12. workbench/model_scripts/chemprop/chemprop.template +931 -0
  13. workbench/model_scripts/chemprop/generated_model_script.py +931 -0
  14. workbench/model_scripts/chemprop/requirements.txt +11 -0
  15. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  16. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  17. workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
  18. workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
  19. workbench/model_scripts/pytorch_model/generated_model_script.py +130 -88
  20. workbench/model_scripts/pytorch_model/pytorch.template +128 -86
  21. workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
  22. workbench/model_scripts/script_generation.py +10 -7
  23. workbench/model_scripts/uq_models/generated_model_script.py +25 -18
  24. workbench/model_scripts/uq_models/mapie.template +23 -16
  25. workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
  26. workbench/model_scripts/xgb_model/xgb_model.template +2 -2
  27. workbench/repl/workbench_shell.py +14 -5
  28. workbench/scripts/endpoint_test.py +162 -0
  29. workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
  30. workbench/utils/chemprop_utils.py +724 -0
  31. workbench/utils/pytorch_utils.py +497 -0
  32. workbench/utils/xgboost_model_utils.py +10 -5
  33. {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/METADATA +2 -2
  34. {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/RECORD +38 -32
  35. {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/entry_points.txt +2 -1
  36. workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
  37. workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
  38. {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/WHEEL +0 -0
  39. {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/licenses/LICENSE +0 -0
  40. {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/top_level.txt +0 -0
@@ -29,23 +29,20 @@ from io import StringIO
  import json
  import argparse
  import joblib
- import os
  import pandas as pd
- from typing import List, Tuple

  # Template Parameters
  TEMPLATE_PARAMS = {
  "model_type": "regressor",
- "target": "udm_asy_res_efflux_ratio",
- "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
+ "target": "caco_2_efflux",
+ "features": ['smr_vsa3', 'xch_6dv', 'tpsa', 'numhdonors', 'vsa_estate2', 'fr_imidazole', 'mollogp', 'molmr', 'vsa_estate3', 'bcut2d_mwhi', 'slogp_vsa1', 'peoe_vsa6', 'peoe_vsa9', 'fr_halogen', 'vsa_estate10', 'fr_pyridine', 'minabsestateindex', 'bcut2d_mwlow', 'peoe_vsa2', 'bcut2d_logplow', 'vsa_estate7', 'peoe_vsa1', 'bcut2d_mrlow', 'slogp_vsa4', 'peoe_vsa7', 'peoe_vsa8', 'bcut2d_logphi', 'estate_vsa8', 'sv', 'fpdensitymorgan1', 'peoe_vsa3', 'slogp_vsa2', 'vsa_estate8', 'hallkieralpha', 'chi2v', 'axp_3dv', 'estate_vsa7', 'peoe_vsa11', 'xc_3d', 'vsa_estate4', 'vsa_estate9', 'bcut2d_chghi', 'vsa_estate5', 'c1sp2', 'slogp_vsa6', 'axp_3d', 'axp_2dv', 'bcut2d_mrhi', 'xc_3dv', 'xpc_4d', 'chi3n', 'xp_3dv', 'kappa3', 'vsa_estate6', 'minestateindex', 'mp', 'avgipc', 'axp_5dv', 'heavyatommolwt', 'maxpartialcharge', 'peoe_vsa10', 'kappa1', 'nocount', 'xp_6d', 'xpc_4dv', 'axp_4dv', 'sz', 'axp_4d', 'xc_5d', 'qed', 'xch_7dv', 'axp_1dv', 'chi0v', 'smr_vsa5', 'maxabspartialcharge', 'minpartialcharge', 'estate_vsa5', 'fpdensitymorgan3', 'fpdensitymorgan2', 'labuteasa', 'fr_methoxy', 'kappa2', 'sps', 'mi', 'bcut2d_chglo', 'vsa_estate1', 'fr_nh2', 'axp_1d', 'maxestateindex', 'estate_vsa3', 'bertzct', 'phi', 'fractioncsp3', 'xc_5dv', 'smr_vsa6', 'estate_vsa6', 'fr_ar_coo', 'axp_7dv', 'slogp_vsa8', 'estate_vsa2', 'axp_2d', 'balabanj', 'mz', 'estate_vsa4', 'smr_vsa7', 'slogp_vsa5', 'chi0n', 'xp_5d', 'xch_7d', 'chi4n', 'smr_vsa10', 'numheterocycles', 'chi1v', 'axp_5d', 'axp_0dv', 'fr_al_oh', 'chi2n', 'fr_nh0', 'chi1n', 'xp_7dv', 'xpc_5d', 'ringcount', 'xpc_5dv', 'peoe_vsa12', 'xpc_6dv', 'xp_6dv', 'mse', 'sp', 'xpc_6d', 'chi4v', 'fr_aniline', 'c2sp3', 'peoe_vsa4', 'numheteroatoms', 'smr_vsa9', 'xp_7d', 'axp_6d', 'molwt', 'numrotatablebonds', 'smr_vsa4', 'stereo_complexity', 'axp_7d', 'slogp_vsa3', 'spe', 'num_r_centers', 'si', 'axp_0d', 'xp_2d', 'xp_4d', 'c2sp2', 'fr_aryl_methyl', 'mpe', 'xch_6d', 'axp_6dv', 'numsaturatedcarbocycles', 'fr_para_hydroxylation', 'estate_vsa10', 'estate_vsa1', 'hybratio', 'numhacceptors', 'naromatom', 'chi1', 'fr_urea', 'xp_3d', 'smr_vsa1', 'num_s_centers', 'xch_5dv', 'c3sp2', 'mare', 'xp_5dv', 'fr_al_oh_notert', 'estate_vsa9', 'fr_piperdine', 'numunspecifiedatomstereocenters', 'chi3v', 'c3sp3', 'chi0', 'numsaturatedheterocycles', 'xp_4dv', 'fr_amide', 'fr_nhpyrrole', 'mv', 'fr_ar_n', 'xc_4dv', 'fr_morpholine', 'fr_ndealkylation2', 'xch_3d', 'xch_4d', 'slogp_vsa10', 'fr_ar_oh', 'fr_benzene', 'fr_nh1', 'c1sp1', 'sse', 'num_defined_stereocenters', 'xch_4dv', 'peoe_vsa14', 'xch_5d', 'fr_hoccn', 'fr_nitrile', 'mm', 'fr_priamide', 'xc_6dv', 'num_unspecified_stereocenters', 'fr_ether', 'fr_piperzine', 'fr_bicyclic', 'fr_term_acetylene'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/caco2-er-reg-pytorch-test/training",
- "train_all_data": False,
+ "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/caco2-efflux-ref-pytorch/training",
+ "train_all_data": True,
  "hyperparameters": {},
  }


- # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
  Check if the provided dataframe is empty and raise an exception if it is.
@@ -60,19 +57,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  raise ValueError(msg)


- def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
+ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
  """
  Expands a column in a DataFrame containing a list of probabilities into separate columns.

  Args:
  df (pd.DataFrame): DataFrame containing a "pred_proba" column
- class_labels (List[str]): List of class labels
+ class_labels (list[str]): List of class labels

  Returns:
  pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
  """
-
- # Sanity check
  proba_column = "pred_proba"
  if proba_column not in df.columns:
  raise ValueError('DataFrame does not contain a "pred_proba" column')
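Note: the expansion this helper performs follows a common pandas pattern; the sketch below is illustrative only, and the per-class "_proba" column suffix is an assumption rather than the template's confirmed naming.

    import pandas as pd

    df = pd.DataFrame({"pred_proba": [[0.9, 0.1], [0.2, 0.8]]})
    class_labels = ["active", "inactive"]

    proba_df = pd.DataFrame(
        df["pred_proba"].tolist(),
        columns=[f"{label}_proba" for label in class_labels],
        index=df.index,
    )
    df = pd.concat([df, proba_df], axis=1)  # adds active_proba and inactive_proba columns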
@@ -89,11 +84,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:

  # Concatenate the new columns with the original DataFrame
  df = pd.concat([df, proba_df], axis=1)
- print(df)
  return df


- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
@@ -118,55 +112,60 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
  return df.rename(columns=rename_dict)


- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+ def convert_categorical_types(
+ df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
  """
  Converts appropriate columns to categorical type with consistent mappings.

  Args:
  df (pd.DataFrame): The DataFrame to process.
  features (list): List of feature names to consider for conversion.
- category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
- training mode. If populated, we're in inference mode.
+ category_mappings (dict, optional): Existing category mappings. If None or empty,
+ we're in training mode. If populated, we're in
+ inference mode.

  Returns:
  tuple: (processed DataFrame, category mappings dictionary)
  """
+ if category_mappings is None:
+ category_mappings = {}
+
  # Training mode
- if category_mappings == {}:
+ if not category_mappings:
  for col in df.select_dtypes(include=["object", "string"]):
  if col in features and df[col].nunique() < 20:
  print(f"Training mode: Converting {col} to category")
  df[col] = df[col].astype("category")
- category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
+ category_mappings[col] = df[col].cat.categories.tolist()

  # Inference mode
  else:
  for col, categories in category_mappings.items():
  if col in df.columns:
  print(f"Inference mode: Applying categorical mapping for {col}")
- df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
+ df[col] = pd.Categorical(df[col], categories=categories)

  return df, category_mappings


  def decompress_features(
- df: pd.DataFrame, features: List[str], compressed_features: List[str]
- ) -> Tuple[pd.DataFrame, List[str]]:
+ df: pd.DataFrame, features: list[str], compressed_features: list[str]
+ ) -> tuple[pd.DataFrame, list[str]]:
  """Prepare features for the model

  Args:
  df (pd.DataFrame): The features DataFrame
- features (List[str]): Full list of feature names
- compressed_features (List[str]): List of feature names to decompress (bitstrings)
+ features (list[str]): Full list of feature names
+ compressed_features (list[str]): List of feature names to decompress (bitstrings)

  Returns:
  pd.DataFrame: DataFrame with the decompressed features
- List[str]: Updated list of feature names after decompression
+ list[str]: Updated list of feature names after decompression

  Raises:
  ValueError: If any missing values are found in the specified features
  """
-
  # Check for any missing values in the required features
  missing_counts = df[features].isna().sum()
  if missing_counts.any():
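Note: the signature change above avoids Python's mutable-default-argument pitfall: a dict default is created once at function definition time and shared across calls, so categories learned in one call could leak into the next. A minimal sketch of the intended two-phase usage, assuming the function above is in scope (the DataFrames and feature values are invented for illustration):

    import pandas as pd

    # Training phase: no mappings passed in, so the function learns them.
    train_df = pd.DataFrame({"solvent": ["DMSO", "water", "DMSO"], "mw": [180.2, 250.1, 310.4]})
    train_df, mappings = convert_categorical_types(train_df, features=["solvent", "mw"])
    # mappings -> {"solvent": ["DMSO", "water"]}

    # Inference phase: pass the stored mappings so new rows get identical category codes.
    new_df = pd.DataFrame({"solvent": ["water"], "mw": [199.9]})
    new_df, _ = convert_categorical_types(new_df, features=["solvent", "mw"], category_mappings=mappings)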
@@ -176,10 +175,11 @@ def decompress_features(
  "WARNING: You might want to remove/replace all NaN values before processing."
  )

- # Decompress the specified compressed features
- decompressed_features = features
+ # Make a copy to avoid mutating the original list
+ decompressed_features = features.copy()
+
  for feature in compressed_features:
- if (feature not in df.columns) or (feature not in features):
+ if (feature not in df.columns) or (feature not in decompressed_features):
  print(f"Feature '{feature}' not in the features list, skipping decompression.")
  continue

@@ -204,26 +204,39 @@ def decompress_features(
  return df, decompressed_features


- def model_fn(model_dir):
+ def model_fn(model_dir: str) -> TabularModel:
+ """Load the PyTorch Tabular model from the specified directory.

+ Args:
+ model_dir: Directory containing the saved model
+
+ Returns:
+ Loaded TabularModel instance
+ """
  # Save current working directory
  original_cwd = os.getcwd()
  try:
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
  os.chdir("/tmp")

- # Load the model
+ # Remove callbacks.sav if it exists - it's not needed for inference and causes
+ # GPU->CPU loading issues (joblib.load doesn't support map_location)
  model_path = os.path.join(model_dir, "tabular_model")
- model = TabularModel.load_model(model_path)
+ callbacks_path = os.path.join(model_path, "callbacks.sav")
+ if os.path.exists(callbacks_path):
+ os.remove(callbacks_path)
+
+ # Load the model (map_location="cpu" ensures GPU-trained models work on CPU endpoints)
+ model = TabularModel.load_model(model_path, map_location="cpu")

- # Restore the original working directory
  finally:
+ # Restore the original working directory
  os.chdir(original_cwd)

  return model


- def input_fn(input_data, content_type):
+ def input_fn(input_data, content_type: str) -> pd.DataFrame:
  """Parse input data and return a DataFrame."""
  if not input_data:
  raise ValueError("Empty input data is not supported!")
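Note: the two changes in model_fn address loading a GPU-trained artifact on a CPU-only endpoint: callbacks.sav is deleted because joblib.load has no way to remap CUDA tensors, while map_location="cpu" is forwarded to the underlying torch loading machinery. A rough illustration of the remapping with plain torch (the checkpoint path is hypothetical):

    import torch

    # A checkpoint whose tensors were saved on "cuda:0" fails to deserialize on a
    # CPU-only host unless storages are remapped at load time.
    ckpt = torch.load("/opt/ml/model/tabular_model/model.ckpt", map_location="cpu")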
@@ -240,18 +253,18 @@ def input_fn(input_data, content_type):
  raise ValueError(f"{content_type} not supported!")


- def output_fn(output_df, accept_type):
+ def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
  """Supports both CSV and JSON output formats."""
  if "text/csv" in accept_type:
- csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
+ csv_output = output_df.fillna("N/A").to_csv(index=False)
  return csv_output, "text/csv"
  elif "application/json" in accept_type:
- return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
+ return output_df.to_json(orient="records"), "application/json"
  else:
  raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


- def predict_fn(df, model) -> pd.DataFrame:
+ def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
  """Make Predictions with our PyTorch Tabular Model

  Args:
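Note: a quick sanity check of the serialization behavior, assuming the functions above are in scope (the DataFrame contents are invented for illustration):

    import numpy as np
    import pandas as pd

    out = pd.DataFrame({"id": [1, 2], "prediction": [3.14, np.nan]})
    csv_body, _ = output_fn(out, "text/csv")           # missing values rendered as "N/A"
    json_body, _ = output_fn(out, "application/json")  # missing values rendered as null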
@@ -275,12 +288,11 @@ def predict_fn(df, model) -> pd.DataFrame:

  # Load our Label Encoder if we have one
  label_encoder = None
- if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
- label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+ label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
+ if os.path.exists(label_encoder_path):
+ label_encoder = joblib.load(label_encoder_path)

- # We're going match features in a case-insensitive manner, accounting for all the permutations
- # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
- # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+ # Match features in a case-insensitive manner
  matched_df = match_features_case_insensitive(df, features)

  # Detect categorical types in the incoming DataFrame
@@ -291,12 +303,25 @@ def predict_fn(df, model) -> pd.DataFrame:
  print("Decompressing features for prediction...")
  matched_df, features = decompress_features(matched_df, features, compressed_features)

+ # Track rows with missing features
+ missing_mask = matched_df[features].isna().any(axis=1)
+ if missing_mask.any():
+ print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
+
+ # Initialize prediction column with NaN
+ df["prediction"] = np.nan
+
+ # Only predict on complete rows
+ complete_df = matched_df[~missing_mask]
+ if len(complete_df) == 0:
+ print("Warning: No complete rows to predict on")
+ return df
+
  # Make predictions using the TabularModel
- result = model.predict(matched_df[features])
+ result = model.predict(complete_df[features])

  # pytorch-tabular returns predictions using f"{target}_prediction" column
- # and classification probabilities in columns ending with "_probability"
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  prediction_column = f"{target}_prediction"
  if prediction_column in result.columns:
  predictions = result[prediction_column].values
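Note: the effect of the new masking logic, sketched with an invented two-row frame: rows with any missing feature keep a NaN prediction instead of failing the whole request.

    import numpy as np
    import pandas as pd

    features = ["mollogp", "tpsa"]
    df = pd.DataFrame({"mollogp": [2.1, np.nan], "tpsa": [45.0, 60.0]})

    missing_mask = df[features].isna().any(axis=1)   # [False, True]
    df["prediction"] = np.nan
    complete_df = df[~missing_mask]                  # only row 0 is sent to the model
    # after model.predict(complete_df[features]):
    df.loc[~missing_mask, "prediction"] = [1.23]     # row 1 keeps its NaN prediction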
@@ -307,20 +332,23 @@ def predict_fn(df, model) -> pd.DataFrame:
  if label_encoder:
  predictions = label_encoder.inverse_transform(predictions.astype(int))

- # Set the predictions on the DataFrame
- df["prediction"] = predictions
+ # Set predictions only for complete rows
+ df.loc[~missing_mask, "prediction"] = predictions

  # For classification, get probabilities
  if label_encoder is not None:
  prob_cols = [col for col in result.columns if col.endswith("_probability")]
  if prob_cols:
  probs = result[prob_cols].values
- df["pred_proba"] = [p.tolist() for p in probs]
+
+ # Build full proba Series with None for missing rows
+ all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
+ all_proba.loc[~missing_mask] = [p.tolist() for p in probs]
+ df["pred_proba"] = all_proba

  # Expand the pred_proba column into separate columns for each class
  df = expand_proba_column(df, label_encoder.classes_)

- # All done, return the DataFrame with new columns for the predictions
  return df

@@ -354,9 +382,21 @@ if __name__ == "__main__":
  # Combine files and read them all into a single pandas dataframe
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

+ # Print out some info about the dataframe
+ print(f"All Data Shape: {all_df.shape}")
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
+
  # Check if the dataframe is empty
  check_dataframe(all_df, "training_df")

+ # Drop any rows with missing feature values
+ initial_row_count = all_df.shape[0]
+ all_df = all_df.dropna(subset=features)
+ dropped_rows = initial_row_count - all_df.shape[0]
+ if dropped_rows > 0:
+ print(f"Dropped {dropped_rows} rows due to missing feature values.")
+
  # Features/Target output
  print(f"Target: {target}")
  print(f"Features: {str(features)}")
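Note: dropna(subset=features) only considers the listed feature columns, so a row is removed when any feature is NaN, while a NaN in a column outside the subset (an id, for example) does not trigger a drop. A minimal sketch with invented values:

    import numpy as np
    import pandas as pd

    features = ["mollogp", "tpsa"]
    all_df = pd.DataFrame({"mollogp": [2.1, np.nan, 0.7], "tpsa": [45.0, 60.0, 88.1]})
    all_df = all_df.dropna(subset=features)  # drops only the middle row (missing mollogp)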
@@ -364,11 +404,25 @@ if __name__ == "__main__":
  # Convert any features that might be categorical to 'category' type
  all_df, category_mappings = convert_categorical_types(all_df, features)

+ # Print out some info about the dataframe
+ print(f"All Data Shape: {all_df.shape}")
+ print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
+ print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
+
  # If we have compressed features, decompress them
  if compressed_features:
  print(f"Decompressing features {compressed_features}...")
  all_df, features = decompress_features(all_df, features, compressed_features)

+ # Determine categorical and continuous columns
+ categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
+ continuous_cols = [col for col in features if col not in categorical_cols]
+ print(f"Categorical columns: {categorical_cols}")
+ print(f"Continuous columns: {continuous_cols}")
+
+ # Cast continuous columns to float
+ all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
+
  # Do we want to train on all the data?
  if train_all_data:
  print("Training on ALL of the data")
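Note: the categorical/continuous split now happens on all_df (before the train/validation split) and is presumably consumed by the DataConfig created a few lines below. A minimal sketch of that wiring, not the template's literal config:

    from pytorch_tabular.config import DataConfig

    # Categorical columns get embeddings; continuous columns are normalized as floats.
    data_config = DataConfig(
        target=["caco_2_efflux"],
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )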
@@ -378,8 +432,8 @@ if __name__ == "__main__":
  # Does the dataframe have a training column?
  elif "training" in all_df.columns:
  print("Found training column, splitting data based on training column")
- df_train = all_df[all_df["training"]]
- df_val = all_df[~all_df["training"]]
+ df_train = all_df[all_df["training"]].copy()
+ df_val = all_df[~all_df["training"]].copy()
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
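Note: the added .copy() calls matter because boolean-mask slices of all_df can otherwise emit pandas' SettingWithCopyWarning when columns such as "pred_proba" or "prediction" are written onto df_val later. A minimal illustration with an invented frame:

    import pandas as pd

    all_df = pd.DataFrame({"training": [True, False], "x": [1.0, 2.0]})
    df_val = all_df[~all_df["training"]]           # a slice still tied to all_df
    df_val["prediction"] = [0.5]                   # can emit SettingWithCopyWarning

    df_val = all_df[~all_df["training"]].copy()    # independent frame: safe to add columns
    df_val["prediction"] = [0.5]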
@@ -387,13 +441,6 @@ if __name__ == "__main__":
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

- # Determine categorical and continuous columns
- categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
- continuous_cols = [col for col in features if col not in categorical_cols]
-
- print(f"Categorical columns: {categorical_cols}")
- print(f"Continuous columns: {continuous_cols}")
-
  # Set up PyTorch Tabular configuration
  data_config = DataConfig(
  target=[target],
@@ -417,11 +464,12 @@ if __name__ == "__main__":

  # Set up PyTorch Tabular configuration with defaults
  trainer_defaults = {
- "auto_lr_find": True,
- "batch_size": min(1024, max(32, len(df_train) // 4)),
+ "auto_lr_find": False,
+ "batch_size": min(128, max(32, len(df_train) // 16)),
  "max_epochs": 100,
+ "min_epochs": 10,
  "early_stopping": "valid_loss",
- "early_stopping_patience": 15,
+ "early_stopping_patience": 10,
  "checkpoints": "valid_loss",
  "accelerator": "auto",
  "progress_bar": "none",
@@ -430,7 +478,6 @@ if __name__ == "__main__":

  # Override defaults with training_config if present
  training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
- # Print overwrites
  for key, value in training_overrides.items():
  print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
  trainer_params = {**trainer_defaults, **training_overrides}
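Note: two small points about the trainer configuration above. First, the new batch-size default is clamped to the range [32, 128]: 300 training rows gives 32, 1,000 rows gives 62, and 10,000 rows hits the 128 cap. Second, only keys already present in trainer_defaults are honored when merging training_config overrides, so an unknown key is silently dropped. A minimal sketch with invented hyperparameter values:

    trainer_defaults = {
        "batch_size": min(128, max(32, 1000 // 16)),  # -> 62 for a 1,000-row training set
        "max_epochs": 100,
        "early_stopping_patience": 10,
    }
    hyperparameters = {"training_config": {"max_epochs": 50, "max_epoch": 25}}  # second key is a typo

    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
    trainer_params = {**trainer_defaults, **training_overrides}
    # trainer_params -> {"batch_size": 62, "max_epochs": 50, "early_stopping_patience": 10}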
@@ -438,23 +485,20 @@ if __name__ == "__main__":

  # Model config defaults
  model_defaults = {
- "layers": "1024-512-512",
- "activation": "ReLU",
+ "layers": "256-128-64",
+ "activation": "LeakyReLU",
  "learning_rate": 1e-3,
- "dropout": 0.1,
+ "dropout": 0.3,
  "use_batch_norm": True,
  "initialization": "kaiming",
  }
  # Override defaults with model_config if present
  model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
- # Print overwrites
  for key, value in model_overrides.items():
  print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
  model_params = {**model_defaults, **model_overrides}

  # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
- # Works effectively for both regression and classification as the foundational
- # architecture in PyTorch Tabular
  model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
  optimizer_config = OptimizerConfig()

@@ -474,36 +518,34 @@ if __name__ == "__main__":
  result = tabular_model.predict(df_val, include_input_features=False)

  # pytorch-tabular returns predictions using f"{target}_prediction" column
- # and classification probabilities in columns ending with "_probability"
- if model_type == "classifier":
- preds = result[f"{target}_prediction"].values
- else:
- # Regression: use the target column name
- preds = result[f"{target}_prediction"].values
+ preds = result[f"{target}_prediction"].values

  if model_type == "classifier":
  # Get probabilities for classification
  print("Processing Probabilities...")
- prob_cols = [col for col in result.columns if col.endswith("_probability")]
+ prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
  if prob_cols:
  probs = result[prob_cols].values
+ df_val = df_val.copy() # Avoid SettingWithCopyWarning
  df_val["pred_proba"] = [p.tolist() for p in probs]

  # Expand the pred_proba column into separate columns for each class
- print(df_val.columns)
+ print(df_val.columns.tolist())
  df_val = expand_proba_column(df_val, label_encoder.classes_)
- print(df_val.columns)
+ print(df_val.columns.tolist())

  # Decode the target and prediction labels
  y_validate = label_encoder.inverse_transform(df_val[target])
- preds = label_encoder.inverse_transform(preds.astype(int))
+ preds_decoded = label_encoder.inverse_transform(preds.astype(int))
  else:
  y_validate = df_val[target].values
+ preds_decoded = preds

- # Save predictions to S3 (just the target, prediction, and '_probability' columns)
- df_val["prediction"] = preds
+ # Save predictions to S3
+ df_val = df_val.copy()
+ df_val["prediction"] = preds_decoded
  output_columns = [target, "prediction"]
- output_columns += [col for col in df_val.columns if col.endswith("_probability")]
+ output_columns += [col for col in df_val.columns if col.endswith("_proba")]
  wr.s3.to_csv(
  df_val[output_columns],
  path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -516,7 +558,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_

  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
+ scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)

  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -529,7 +571,7 @@ if __name__ == "__main__":
  }
  )

- # We need to get creative with the Classification Metrics
+ # Output metrics per class
  metrics = ["precision", "recall", "f1", "support"]
  for t in label_names:
  for m in metrics:
@@ -537,7 +579,7 @@ if __name__ == "__main__":
  print(f"Metrics:{t}:{m} {value}")

  # Compute and output the confusion matrix
- conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
+ conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
  for i, row_name in enumerate(label_names):
  for j, col_name in enumerate(label_names):
  value = conf_mtx[i, j]
@@ -545,9 +587,9 @@ if __name__ == "__main__":

  else:
  # Calculate various model performance metrics (regression)
- rmse = root_mean_squared_error(y_validate, preds)
- mae = mean_absolute_error(y_validate, preds)
- r2 = r2_score(y_validate, preds)
+ rmse = root_mean_squared_error(y_validate, preds_decoded)
+ mae = mean_absolute_error(y_validate, preds_decoded)
+ r2 = r2_score(y_validate, preds_decoded)
  print(f"RMSE: {rmse:.3f}")
  print(f"MAE: {mae:.3f}")
  print(f"R2: {r2:.3f}")
@@ -560,7 +602,7 @@ if __name__ == "__main__":

  # Save the features (this will validate input during predictions)
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
- json.dump(orig_features, fp) # We save the original features, not the decompressed ones
+ json.dump(orig_features, fp)

  # Save the category mappings
  with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp: