workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench has been flagged as possibly problematic.

Files changed (27)
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +67 -8
  4. workbench/core/views/training_view.py +38 -48
  5. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  6. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  7. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  8. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  9. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
  10. workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
  11. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  12. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  13. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  14. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  15. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  16. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  17. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  18. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  19. workbench/scripts/ml_pipeline_sqs.py +14 -2
  20. workbench/utils/model_utils.py +12 -2
  21. workbench/utils/xgboost_model_utils.py +161 -138
  22. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
  23. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
  24. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
  25. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
  26. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
@@ -36,12 +36,12 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
  "model_type": "{{model_type}}",
- "target_column": "{{target_column}}",
+ "target": "{{target_column}}",
  "features": "{{feature_list}}",
  "compressed_features": "{{compressed_features}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  "train_all_data": "{{train_all_data}}",
- "hyperparameters": "{{hyperparameters}}"
+ "hyperparameters": "{{hyperparameters}}",
  }


@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  df_columns_lower = {col.lower(): col for col in df.columns}
  rename_dict = {}
  missing = []
-
  for feature in model_features:
  if feature in df.columns:
  continue # Exact match
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  if missing:
  raise ValueError(f"Features not found: {missing}")

+ # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)


@@ -210,7 +210,7 @@ def model_fn(model_dir):
  original_cwd = os.getcwd()
  try:
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
- os.chdir('/tmp')
+ os.chdir("/tmp")

  # Load the model
  model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@ if __name__ == "__main__":
  """The main function is for training the PyTorch Tabular model"""

  # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@ if __name__ == "__main__":
  }

  # Override defaults with training_config if present
- training_overrides = {k: v for k, v in hyperparameters.get('training_config', {}).items()
- if k in trainer_defaults}
+ training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
  # Print overwrites
  for key, value in training_overrides.items():
  print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@ if __name__ == "__main__":
  "initialization": "kaiming",
  }
  # Override defaults with model_config if present
- model_overrides = {k: v for k, v in hyperparameters.get('model_config', {}).items()
- if k in model_defaults}
+ model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
  # Print overwrites
  for key, value in model_overrides.items():
  print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
@@ -461,10 +455,7 @@ if __name__ == "__main__":
  # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
  # Works effectively for both regression and classification as the foundational
  # architecture in PyTorch Tabular
- model_config = CategoryEmbeddingModelConfig(
- task=task,
- **model_params
- )
+ model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
  optimizer_config = OptimizerConfig()

  #####################################
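
The override hunks above (which line up with pytorch.template's +9/−18 entry in the file list) compact the hyperparameter-override comprehensions onto single lines. The pattern is easy to miss in diff form: only keys that already exist in the defaults dict are accepted as overrides. A minimal standalone sketch of that filter-and-override pattern, with illustrative default values and an assumed final `update` step that is not shown in the diff:

```python
# Filter-and-override pattern used for training_config / model_config (illustrative values)
trainer_defaults = {"max_epochs": 100, "batch_size": 128, "early_stopping": True}

# Hyperparameters as they might arrive via TEMPLATE_PARAMS (hypothetical example)
hyperparameters = {"training_config": {"max_epochs": 50, "not_a_trainer_key": 1}}

# Keep only keys the trainer actually knows about, then overlay them on the defaults
training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
for key, value in training_overrides.items():
    print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} -> {value}")

trainer_defaults.update(training_overrides)  # assumed merge step; unknown keys are silently dropped
```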
@@ -4,11 +4,7 @@ import awswrangler as wr
  from sklearn.model_selection import train_test_split

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
@@ -22,9 +18,10 @@ TEMPLATE_PARAMS = {
  "target_column": "{{target_column}}",
  "features": "{{feature_list}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -64,6 +61,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)

+
  if __name__ == "__main__":
  """The main function is for training the XGBoost Quantile Regression models"""

@@ -86,10 +84,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
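
Several of the templates in this release collapse the same multi-line list comprehension used to gather the training CSVs. For reference, a minimal sketch of that loading step; the directory path is a placeholder for the SageMaker training channel, not a value from the diff:

```python
import os
import pandas as pd

train_dir = "/opt/ml/input/data/train"  # hypothetical training-channel path

# Collect every CSV in the training directory and concatenate into one DataFrame
training_files = [os.path.join(train_dir, f) for f in os.listdir(train_dir) if f.endswith(".csv")]
all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
print(f"Training Files: {training_files}")
print(f"Combined shape: {all_df.shape}")
```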
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
  "feature_list": "{{feature_list}}",
  "model_class": "{{model_class}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

  import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":

  if needs_standardization:
  # Create a pipeline with standardization and the model
- model = Pipeline([
- ("scaler", StandardScaler()),
- ("model", model)
- ])
+ model = Pipeline([("scaler", StandardScaler()), ("model", model)])

  # Handle logic based on the model_type
  if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
  json.dump(feature_list, fp)

+
  #
  # Inference Section
  #
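
The scikit-learn template hunk above reformats the conditional Pipeline wrapping onto one line. The useful property of that pattern is that wrapping an estimator in a Pipeline with a StandardScaler preserves the fit/predict interface, so the rest of the training script is untouched. A small sketch; the Ridge estimator and the needs_standardization flag are illustrative stand-ins, not the template's actual model_class logic:

```python
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

model = Ridge()                 # stand-in for the instantiated {{model_class}}
needs_standardization = True    # assumed to be derived from the chosen model class

if needs_standardization:
    # The Pipeline exposes the same fit/predict API as the bare estimator,
    # so downstream training and inference code does not change
    model = Pipeline([("scaler", StandardScaler()), ("model", model)])

# model.fit(X_train, y_train) and model.predict(X_val) behave identically either way
```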
@@ -32,10 +32,12 @@ TEMPLATE_PARAMS = {
  "target": "udm_asy_res_value",
  "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
  "compressed_features": [],
- "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/pka-a1-reg-0-nightly-100-test/training",
- "train_all_data": True
+ "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/logd-hyper-80/training",
+ "train_all_data": False,
+ "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
  }

+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
  proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

  # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
  df = df.reset_index(drop=True)

  # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model

  Args:
  df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
  # Now spin up our XGB Model
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

  # Encode the target column
  label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
  df_val[target] = label_encoder.transform(df_val[target])

  else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
  label_encoder = None # We don't need this for regression

  # Grab our Features, Target and Train the Model
  y_train = df_train[target]
- X_train= df_train[features]
+ X_train = df_train[features]
  xgb_model.fit(X_train, y_train)

  # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_

  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
  print(f"NumRows: {len(df_val)}")

  # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
  if label_encoder:
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

  def model_fn(model_dir):
  """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
-
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
  return model


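The last two hunks above switch model persistence from XGBoost's native JSON format to joblib, both in this generated script and in the xgb_model.template it was rendered from (shown next). The practical effect is that model_fn no longer has to parse the saved JSON to figure out whether to rebuild an XGBClassifier or an XGBRegressor; joblib restores the fitted scikit-learn wrapper, class and parameters included. A minimal sketch of the round trip, with an illustrative model directory and estimator settings:

```python
import os
import joblib
import xgboost as xgb

model_dir = "/opt/ml/model"  # placeholder for SageMaker's model directory

# Training side: persist the fitted sklearn-style wrapper directly
xgb_model = xgb.XGBRegressor(enable_categorical=True, n_estimators=200, max_depth=6)
# ... xgb_model.fit(X_train, y_train) ...
joblib.dump(xgb_model, os.path.join(model_dir, "xgb_model.joblib"))

# Inference side: joblib.load restores the exact estimator class, so no JSON inspection is needed
def model_fn(model_dir):
    """Deserialize and return the fitted XGBoost model."""
    return joblib.load(os.path.join(model_dir, "xgb_model.joblib"))
```

One trade-off worth noting: a joblib pickle ties the artifact to compatible xgboost/scikit-learn versions at load time, whereas the native JSON format is more version-tolerant, so training and inference containers presumably pin matching library versions.
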
@@ -33,9 +33,11 @@ TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "compressed_features": "{{compressed_features}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
+ "hyperparameters": "{{hyperparameters}}",
  }

+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
  proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

  # Drop any proba columns and reset the index in prep for the concat
- df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
  df = df.reset_index(drop=True)

  # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
  return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
- """Prepare features for the XGBoost model
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model

  Args:
  df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
  model_type = TEMPLATE_PARAMS["model_type"]
  model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
  train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
  validation_split = 0.2

  # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
  else:
  # Just do a random training Split
  print("WARNING: No training column found, splitting data with random state=42")
- df_train, df_val = train_test_split(
- all_df, test_size=validation_split, random_state=42
- )
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
  print(f"FIT/TRAIN: {df_train.shape}")
  print(f"VALIDATION: {df_val.shape}")

+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
  # Now spin up our XGB Model
  if model_type == "classifier":
- xgb_model = xgb.XGBClassifier(enable_categorical=True)
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

  # Encode the target column
  label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
  df_val[target] = label_encoder.transform(df_val[target])

  else:
- xgb_model = xgb.XGBRegressor(enable_categorical=True)
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
  label_encoder = None # We don't need this for regression

  # Grab our Features, Target and Train the Model
  y_train = df_train[target]
- X_train= df_train[features]
+ X_train = df_train[features]
  xgb_model.fit(X_train, y_train)

  # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
  label_names = label_encoder.classes_

  # Calculate various model performance metrics
- scores = precision_recall_fscore_support(
- y_validate, preds, average=None, labels=label_names
- )
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

  # Put the scores into a dataframe
  score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
  print(f"NumRows: {len(df_val)}")

  # Now save the model to the standard place/name
- xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
  if label_encoder:
  joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

  def model_fn(model_dir):
  """Deserialize and return fitted XGBoost model"""
-
- model_path = os.path.join(model_dir, "xgb_model.json")
-
- with open(model_path, "r") as f:
- model_json = json.load(f)
-
- sklearn_data = model_json['learner']['attributes']['scikit_learn']
- model_type = json.loads(sklearn_data)['_estimator_type']
-
- model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
- model = model_class(enable_categorical=True)
- model.load_model(model_path)
-
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
  return model


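Both the rendered script and the xgb_model.template above now pull a hyperparameters dict out of TEMPLATE_PARAMS and splat it straight into the XGBoost constructor. A short sketch of how that behaves; the parameter values are illustrative, and an empty dict reproduces the old behaviour because **{} expands to no keyword arguments:

```python
import xgboost as xgb

# Hypothetical hyperparameters as rendered into TEMPLATE_PARAMS by the model-build step
hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05}
model_type = "regressor"

if model_type == "classifier":
    xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
else:
    xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)

print(xgb_model.get_params()["max_depth"])  # 6

# With no overrides the constructor falls back to XGBoost's defaults, as before
default_model = xgb.XGBRegressor(enable_categorical=True, **{})
```
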
@@ -13,12 +13,13 @@ cm = ConfigManager()
  workbench_bucket = cm.get_config("WORKBENCH_BUCKET")


- def submit_to_sqs(script_path: str, size: str = "small") -> None:
+ def submit_to_sqs(script_path: str, size: str = "small", realtime: bool = False) -> None:
  """
  Upload script to S3 and submit message to SQS queue for processing.
  Args:
  script_path: Local path to the ML pipeline script
  size: Job size tier - "small" (default), "medium", or "large"
+ realtime: If True, sets serverless=False for real-time processing (default: False, meaning serverless=True)
  """
  print(f"\n{'=' * 60}")
  print("🚀 SUBMITTING ML PIPELINE JOB")
@@ -33,6 +34,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

  print(f"📄 Script: {script_file.name}")
  print(f"📏 Size tier: {size}")
+ print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
  print(f"🪣 Bucket: {workbench_bucket}")
  sqs = AWSAccountClamp().boto3_session.client("sqs")
  script_name = script_file.name
@@ -88,6 +90,10 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

  # Prepare message
  message = {"script_path": s3_path, "size": size}
+
+ # Set serverless environment variable (defaults to True, False if --realtime)
+ message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
+
  print("\n📨 Sending message to SQS...")

  # Send the message to SQS
@@ -110,6 +116,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
  print(f"{'=' * 60}")
  print(f"📄 Script: {script_name}")
  print(f"📏 Size: {size}")
+ print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
  print(f"🆔 Message ID: {message_id}")
  print("\n🔍 MONITORING LOCATIONS:")
  print(f" • SQS Queue: AWS Console → SQS → {queue_name}")
@@ -126,9 +133,14 @@ def main():
  parser.add_argument(
  "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
  )
+ parser.add_argument(
+ "--realtime",
+ action="store_true",
+ help="Run in real-time mode (sets serverless=False). Default is serverless mode (serverless=True)",
+ )
  args = parser.parse_args()
  try:
- submit_to_sqs(args.script_file, args.size)
+ submit_to_sqs(args.script_file, args.size, realtime=args.realtime)
  except Exception as e:
  print(f"\n❌ ERROR: {e}")
  log.error(f"Error: {e}")
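
The ml_pipeline_sqs.py changes above add a --realtime flag that rides along in the SQS message as a SERVERLESS environment variable. A minimal sketch of the message shape and how it could be sent; the helper name, queue URL, and region are placeholders, not values from the diff:

```python
import json
import boto3

def build_pipeline_message(s3_path: str, size: str = "small", realtime: bool = False) -> dict:
    """Mirror the message shape built in submit_to_sqs (helper name is ours, not Workbench's)."""
    message = {"script_path": s3_path, "size": size}
    # SERVERLESS defaults to "True"; --realtime flips it to "False" for a real-time endpoint
    message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
    return message

msg = build_pipeline_message("s3://my-bucket/scripts/pipeline.py", size="medium", realtime=True)
print(json.dumps(msg, indent=2))

# Sending is a one-liner with boto3 (queue URL and region are placeholders)
sqs = boto3.client("sqs", region_name="us-east-1")
# sqs.send_message(QueueUrl="https://sqs.us-east-1.amazonaws.com/123456789012/ml-pipeline",
#                  MessageBody=json.dumps(msg))
```

Presumably the job runner exports the environment dict before executing the script, so the pipeline can check SERVERLESS when it creates the endpoint.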
@@ -222,6 +222,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
  lower_95, upper_95 = df["q_025"], df["q_975"]
  lower_90, upper_90 = df["q_05"], df["q_95"]
  lower_80, upper_80 = df["q_10"], df["q_90"]
+ lower_68 = df.get("q_16", 0)
+ upper_68 = df.get("q_84", 0)
  lower_50, upper_50 = df["q_25"], df["q_75"]
  elif "prediction_std" in df.columns:
  lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -230,6 +232,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
  upper_90 = df["prediction"] + 1.645 * df["prediction_std"]
  lower_80 = df["prediction"] - 1.282 * df["prediction_std"]
  upper_80 = df["prediction"] + 1.282 * df["prediction_std"]
+ lower_68 = df["prediction"] - 1.0 * df["prediction_std"]
+ upper_68 = df["prediction"] + 1.0 * df["prediction_std"]
  lower_50 = df["prediction"] - 0.674 * df["prediction_std"]
  upper_50 = df["prediction"] + 0.674 * df["prediction_std"]
  else:
@@ -241,11 +245,13 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
  coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
  coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
  coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
+ coverage_68 = np.mean((df[target_col] >= lower_68) & (df[target_col] <= upper_68))
  coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
  avg_width_95 = np.mean(upper_95 - lower_95)
  avg_width_90 = np.mean(upper_90 - lower_90)
  avg_width_80 = np.mean(upper_80 - lower_80)
  avg_width_50 = np.mean(upper_50 - lower_50)
+ avg_width_68 = np.mean(upper_68 - lower_68)

  # --- CRPS (measures calibration + sharpness) ---
  z = (df[target_col] - df["prediction"]) / df["prediction_std"]
@@ -269,12 +275,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
  # Collect results
  results = {
  "coverage_50": coverage_50,
+ "coverage_68": coverage_68,
  "coverage_80": coverage_80,
  "coverage_90": coverage_90,
  "coverage_95": coverage_95,
- "avg_std": avg_std,
  "median_std": median_std,
+ "avg_std": avg_std,
  "avg_width_50": avg_width_50,
+ "avg_width_68": avg_width_68,
  "avg_width_80": avg_width_80,
  "avg_width_90": avg_width_90,
  "avg_width_95": avg_width_95,
@@ -286,12 +294,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

  print("\n=== UQ Metrics ===")
  print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
+ print(f"Coverage @ 68%: {coverage_68:.3f} (target: 0.68)")
  print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
  print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
  print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
- print(f"Avg Prediction StdDev: {avg_std:.3f}")
  print(f"Median Prediction StdDev: {median_std:.3f}")
+ print(f"Avg Prediction StdDev: {avg_std:.3f}")
  print(f"Average 50% Width: {avg_width_50:.3f}")
+ print(f"Average 68% Width: {avg_width_68:.3f}")
  print(f"Average 80% Width: {avg_width_80:.3f}")
  print(f"Average 90% Width: {avg_width_90:.3f}")
  print(f"Average 95% Width: {avg_width_95:.3f}")