workbench 0.8.161-py3-none-any.whl → 0.8.192-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +12 -0
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/meta.py +5 -2
  7. workbench/api/model.py +16 -12
  8. workbench/api/monitor.py +1 -16
  9. workbench/core/artifacts/artifact.py +11 -3
  10. workbench/core/artifacts/data_capture_core.py +355 -0
  11. workbench/core/artifacts/endpoint_core.py +168 -78
  12. workbench/core/artifacts/feature_set_core.py +72 -13
  13. workbench/core/artifacts/model_core.py +50 -15
  14. workbench/core/artifacts/monitor_core.py +33 -248
  15. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  16. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  17. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  18. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  19. workbench/core/transforms/features_to_model/features_to_model.py +9 -4
  20. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  21. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  22. workbench/core/views/training_view.py +49 -53
  23. workbench/core/views/view.py +51 -1
  24. workbench/core/views/view_utils.py +4 -4
  25. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  26. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  27. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  28. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  29. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  30. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  31. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  32. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  33. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  34. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  35. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  36. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  37. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  38. workbench/model_scripts/pytorch_model/pytorch.template +19 -20
  39. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  40. workbench/model_scripts/script_generation.py +7 -2
  41. workbench/model_scripts/uq_models/mapie.template +492 -0
  42. workbench/model_scripts/uq_models/requirements.txt +1 -0
  43. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  44. workbench/repl/workbench_shell.py +11 -6
  45. workbench/scripts/lambda_launcher.py +63 -0
  46. workbench/scripts/ml_pipeline_batch.py +137 -0
  47. workbench/scripts/ml_pipeline_sqs.py +186 -0
  48. workbench/scripts/monitor_cloud_watch.py +20 -100
  49. workbench/utils/aws_utils.py +4 -3
  50. workbench/utils/chem_utils/__init__.py +0 -0
  51. workbench/utils/chem_utils/fingerprints.py +134 -0
  52. workbench/utils/chem_utils/misc.py +194 -0
  53. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  54. workbench/utils/chem_utils/mol_standardize.py +450 -0
  55. workbench/utils/chem_utils/mol_tagging.py +348 -0
  56. workbench/utils/chem_utils/projections.py +209 -0
  57. workbench/utils/chem_utils/salts.py +256 -0
  58. workbench/utils/chem_utils/sdf.py +292 -0
  59. workbench/utils/chem_utils/toxicity.py +250 -0
  60. workbench/utils/chem_utils/vis.py +253 -0
  61. workbench/utils/cloudwatch_handler.py +1 -1
  62. workbench/utils/cloudwatch_utils.py +137 -0
  63. workbench/utils/config_manager.py +3 -7
  64. workbench/utils/endpoint_utils.py +5 -7
  65. workbench/utils/license_manager.py +2 -6
  66. workbench/utils/model_utils.py +76 -30
  67. workbench/utils/monitor_utils.py +44 -62
  68. workbench/utils/pandas_utils.py +3 -3
  69. workbench/utils/shap_utils.py +10 -2
  70. workbench/utils/workbench_logging.py +0 -3
  71. workbench/utils/workbench_sqs.py +1 -1
  72. workbench/utils/xgboost_model_utils.py +283 -145
  73. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  74. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  75. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  76. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
  77. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
  78. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
  79. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  80. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  81. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  82. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  83. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  84. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  85. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
  86. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  87. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  88. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  89. workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
  90. workbench/utils/chem_utils.py +0 -1556
  91. workbench/utils/execution_environment.py +0 -211
  92. workbench/utils/fast_inference.py +0 -167
  93. workbench/utils/resource_utils.py +0 -39
  94. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
  95. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
  96. {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
workbench/model_scripts/uq_models/mapie.template
@@ -0,0 +1,492 @@
+ # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
+ from mapie.regression import ConformalizedQuantileRegressor
+ from lightgbm import LGBMRegressor
+ from xgboost import XGBRegressor
+ from sklearn.model_selection import train_test_split
+
+ # Model Performance Scores
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
+
+ from io import StringIO
+ import json
+ import argparse
+ import joblib
+ import os
+ import numpy as np
+ import pandas as pd
+ from typing import List, Tuple
+
+ # Template Placeholders
+ TEMPLATE_PARAMS = {
+     "target": "{{target_column}}",
+     "features": "{{feature_list}}",
+     "compressed_features": "{{compressed_features}}",
+     "train_all_data": "{{train_all_data}}",
+     "hyperparameters": "{{hyperparameters}}",
+ }
+
+
+ # Function to check if dataframe is empty
+ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+     """
+     Check if the provided dataframe is empty and raise an exception if it is.
+
+     Args:
+         df (pd.DataFrame): DataFrame to check
+         df_name (str): Name of the DataFrame
+     """
+     if df.empty:
+         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+         print(msg)
+         raise ValueError(msg)
+
+
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+     """
+     Matches and renames DataFrame columns to match model feature names (case-insensitive).
+     Prioritizes exact matches, then case-insensitive matches.
+
+     Raises ValueError if any model features cannot be matched.
+     """
+     df_columns_lower = {col.lower(): col for col in df.columns}
+     rename_dict = {}
+     missing = []
+     for feature in model_features:
+         if feature in df.columns:
+             continue  # Exact match
+         elif feature.lower() in df_columns_lower:
+             rename_dict[df_columns_lower[feature.lower()]] = feature
+         else:
+             missing.append(feature)
+
+     if missing:
+         raise ValueError(f"Features not found: {missing}")
+
+     # Rename the DataFrame columns to match the model features
+     return df.rename(columns=rename_dict)
+
+
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+     """
+     Converts appropriate columns to categorical type with consistent mappings.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to process.
+         features (list): List of feature names to consider for conversion.
+         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+             training mode. If populated, we're in inference mode.
+
+     Returns:
+         tuple: (processed DataFrame, category mappings dictionary)
+     """
+     # Training mode
+     if category_mappings == {}:
+         for col in df.select_dtypes(include=["object", "string"]):
+             if col in features and df[col].nunique() < 20:
+                 print(f"Training mode: Converting {col} to category")
+                 df[col] = df[col].astype("category")
+                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+     # Inference mode
+     else:
+         for col, categories in category_mappings.items():
+             if col in df.columns:
+                 print(f"Inference mode: Applying categorical mapping for {col}")
+                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+     return df, category_mappings
+
+
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features
+
+     Args:
+         df (pd.DataFrame): The features DataFrame
+         features (List[str]): Full list of feature names
+         compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+     Returns:
+         pd.DataFrame: DataFrame with the decompressed features
+         List[str]: Updated list of feature names after decompression
+
+     Raises:
+         ValueError: If any missing values are found in the specified features
+     """
+
+     # Check for any missing values in the required features
+     missing_counts = df[features].isna().sum()
+     if missing_counts.any():
+         missing_features = missing_counts[missing_counts > 0]
+         print(
+             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+             "WARNING: You might want to remove/replace all NaN values before processing."
+         )
+
+     # Decompress the specified compressed features
+     decompressed_features = features.copy()
+     for feature in compressed_features:
+         if (feature not in df.columns) or (feature not in features):
+             print(f"Feature '{feature}' not in the features list, skipping decompression.")
+             continue
+
+         # Remove the feature from the list of features to avoid duplication
+         decompressed_features.remove(feature)
+
+         # Handle all compressed features as bitstrings
+         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+         prefix = feature[:3]
+
+         # Create all new columns at once - avoids fragmentation
+         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+         # Add to features list
+         decompressed_features.extend(new_col_names)
+
+         # Drop original column and concatenate new ones
+         df = df.drop(columns=[feature])
+         df = pd.concat([df, new_df], axis=1)
+
+     return df, decompressed_features
+
+
+ if __name__ == "__main__":
+     # Template Parameters
+     target = TEMPLATE_PARAMS["target"]
+     features = TEMPLATE_PARAMS["features"]
+     orig_features = features.copy()
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
+     validation_split = 0.2
+
+     # Script arguments for input/output directories
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+     parser.add_argument(
+         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+     )
+     args = parser.parse_args()
+
+     # Read the training data into DataFrames
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+     print(f"Training Files: {training_files}")
+
+     # Combine files and read them all into a single pandas dataframe
+     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+     # Check if the dataframe is empty
+     check_dataframe(all_df, "training_df")
+
+     # Features/Target output
+     print(f"Target: {target}")
+     print(f"Features: {str(features)}")
+
+     # Convert any features that might be categorical to 'category' type
+     all_df, category_mappings = convert_categorical_types(all_df, features)
+
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print(f"Decompressing features {compressed_features}...")
+         all_df, features = decompress_features(all_df, features, compressed_features)
+
+     # Do we want to train on all the data?
+     if train_all_data:
+         print("Training on ALL of the data")
+         df_train = all_df.copy()
+         df_val = all_df.copy()
+
+     # Does the dataframe have a training column?
+     elif "training" in all_df.columns:
+         print("Found training column, splitting data based on training column")
+         df_train = all_df[all_df["training"]]
+         df_val = all_df[~all_df["training"]]
+     else:
+         # Just do a random training Split
+         print("WARNING: No training column found, splitting data with random state=42")
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+     print(f"FIT/TRAIN: {df_train.shape}")
+     print(f"VALIDATION: {df_val.shape}")
+
+     # Prepare features and targets for training
+     X_train = df_train[features]
+     X_validate = df_val[features]
+     y_train = df_train[target]
+     y_validate = df_val[target]
+
+     # Train XGBoost for point predictions
+     print("\nTraining XGBoost for point predictions...")
+     print(f" Hyperparameters: {hyperparameters}")
+     xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
+     xgb_model.fit(X_train, y_train)
+
+     # Evaluate XGBoost performance
+     y_pred_xgb = xgb_model.predict(X_validate)
+     xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
+     xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
+     xgb_r2 = r2_score(y_validate, y_pred_xgb)
+
+     print(f"\nXGBoost Point Prediction Performance:")
+     print(f"RMSE: {xgb_rmse:.3f}")
+     print(f"MAE: {xgb_mae:.3f}")
+     print(f"R2: {xgb_r2:.3f}")
+
+     # Define confidence levels we want to model
+     confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals
+
+     # Store MAPIE models for each confidence level
+     mapie_models = {}
+
+     # Train models for each confidence level
+     for confidence_level in confidence_levels:
+         alpha = 1 - confidence_level
+         lower_q = alpha / 2
+         upper_q = 1 - alpha / 2
+
+         print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+         print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+         # Train three models for this confidence level
+         quantile_estimators = []
+         for q in [lower_q, upper_q, 0.5]:
+             print(f" Training model for quantile {q:.3f}...")
+             est = LGBMRegressor(
+                 objective="quantile",
+                 alpha=q,
+                 n_estimators=1000,
+                 max_depth=6,
+                 learning_rate=0.01,
+                 num_leaves=31,
+                 min_child_samples=20,
+                 subsample=0.8,
+                 colsample_bytree=0.8,
+                 random_state=42,
+                 verbose=-1,
+                 force_col_wise=True,
+             )
+             est.fit(X_train, y_train)
+             quantile_estimators.append(est)
+
+         # Create MAPIE CQR model for this confidence level
+         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+         mapie_model = ConformalizedQuantileRegressor(
+             quantile_estimators, confidence_level=confidence_level, prefit=True
+         )
+
+         # Conformalize the model
+         print(f" Conformalizing with validation data...")
+         mapie_model.conformalize(X_validate, y_validate)
+
+         # Store the model
+         mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
+
+         # Validate coverage for this confidence level
+         y_pred, y_pis = mapie_model.predict_interval(X_validate)
+         coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
+         print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+     print(f"\nOverall Model Performance Summary:")
+     print(f"XGBoost RMSE: {xgb_rmse:.3f}")
+     print(f"XGBoost MAE: {xgb_mae:.3f}")
+     print(f"XGBoost R2: {xgb_r2:.3f}")
+     print(f"NumRows: {len(df_val)}")
+
+     # Analyze interval widths across confidence levels
+     print(f"\nInterval Width Analysis:")
+     for conf_level in confidence_levels:
+         model = mapie_models[f"mapie_{conf_level:.2f}"]
+         _, y_pis = model.predict_interval(X_validate)
+         widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+         print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
+     # Save the trained XGBoost model
+     joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+     # Save all MAPIE models
+     for model_name, model in mapie_models.items():
+         joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
+
+     # Save the feature list
+     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+         json.dump(features, fp)
+
+     # Save category mappings if any
+     if category_mappings:
+         with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+             json.dump(category_mappings, fp)
+
+     # Save model configuration
+     model_config = {
+         "model_type": "XGBoost_MAPIE_CQR_LightGBM",
+         "confidence_levels": confidence_levels,
+         "n_features": len(features),
+         "target": target,
+         "validation_metrics": {
+             "xgb_rmse": float(xgb_rmse),
+             "xgb_mae": float(xgb_mae),
+             "xgb_r2": float(xgb_r2),
+             "n_validation": len(df_val),
+         },
+     }
+     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
+         json.dump(model_config, fp, indent=2)
+
+     print(f"\nModel training complete!")
+     print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
+
+
+ #
+ # Inference Section
+ #
+ def model_fn(model_dir) -> dict:
+     """Load XGBoost and all MAPIE models from the specified directory."""
+
+     # Load model configuration to know which models to load
+     with open(os.path.join(model_dir, "model_config.json")) as fp:
+         config = json.load(fp)
+
+     # Load XGBoost regressor
+     xgb_path = os.path.join(model_dir, "xgb_model.joblib")
+     xgb_model = joblib.load(xgb_path)
+
+     # Load all MAPIE models
+     mapie_models = {}
+     for conf_level in config["confidence_levels"]:
+         model_name = f"mapie_{conf_level:.2f}"
+         mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
+
+     # Load category mappings if they exist
+     category_mappings = {}
+     category_path = os.path.join(model_dir, "category_mappings.json")
+     if os.path.exists(category_path):
+         with open(category_path) as fp:
+             category_mappings = json.load(fp)
+
+     return {
+         "xgb_model": xgb_model,
+         "mapie_models": mapie_models,
+         "confidence_levels": config["confidence_levels"],
+         "category_mappings": category_mappings,
+     }
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         # Convert categorical columns to string to avoid fillna issues
+         for col in output_df.select_dtypes(include=["category"]).columns:
+             output_df[col] = output_df[col].astype(str)
+         csv_output = output_df.fillna("N/A").to_csv(index=False)
+         return csv_output, "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ def predict_fn(df, models) -> pd.DataFrame:
+     """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
+
+     Args:
+         df (pd.DataFrame): The input DataFrame
+         models (dict): Dictionary containing XGBoost and MAPIE models
+
+     Returns:
+         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
+     """
+
+     # Flag for outlier stretch adjustment for the prediction intervals
+     # if the predicted values are outside the intervals
+     outlier_stretch = False
+
+     # Grab our feature columns (from training)
+     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+         model_features = json.load(fp)
+
+     # Match features in a case-insensitive manner
+     matched_df = match_features_case_insensitive(df, model_features)
+
+     # Apply categorical mappings if they exist
+     if models.get("category_mappings"):
+         matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
+
+     # Get features for prediction
+     X = matched_df[model_features]
+
+     # Get XGBoost point predictions
+     df["prediction"] = models["xgb_model"].predict(X)
+
+     # Get predictions from each MAPIE model for conformalized intervals
+     for conf_level in models["confidence_levels"]:
+         model_name = f"mapie_{conf_level:.2f}"
+         model = models["mapie_models"][model_name]
+
+         # Get conformalized predictions
+         y_pred, y_pis = model.predict_interval(X)
+
+         # Map confidence levels to quantile names
+         if conf_level == 0.50:  # 50% CI
+             df["q_25"] = y_pis[:, 0, 0]
+             df["q_75"] = y_pis[:, 1, 0]
+         elif conf_level == 0.68:  # 68% CI
+             df["q_16"] = y_pis[:, 0, 0]
+             df["q_84"] = y_pis[:, 1, 0]
+         elif conf_level == 0.80:  # 80% CI
+             df["q_10"] = y_pis[:, 0, 0]
+             df["q_90"] = y_pis[:, 1, 0]
+         elif conf_level == 0.90:  # 90% CI
+             df["q_05"] = y_pis[:, 0, 0]
+             df["q_95"] = y_pis[:, 1, 0]
+         elif conf_level == 0.95:  # 95% CI
+             df["q_025"] = y_pis[:, 0, 0]
+             df["q_975"] = y_pis[:, 1, 0]
+
+     # Add median (q_50) from XGBoost prediction
+     df["q_50"] = df["prediction"]
+
+     # Calculate a pseudo-standard deviation from the 68% interval width
+     df["prediction_std"] = (df["q_84"] - df["q_16"]).abs() / 2.0
+
+     # Reorder the quantile columns for easier reading
+     quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
+     other_cols = [col for col in df.columns if col not in quantile_cols]
+     df = df[other_cols + quantile_cols]
+
+     # Adjust the outer quantiles to ensure they encompass the prediction
+     if outlier_stretch:
+         # Lower intervals adjustments
+         df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+         df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+         df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+         df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+         df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+         # Upper intervals adjustments
+         df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+         df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+         df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+         df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+         df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+
+     return df
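
For readers unfamiliar with the MAPIE v1 conformalized quantile regression (CQR) flow used in mapie.template above, here is a minimal, self-contained sketch (not part of the package) that mirrors it on synthetic data: three pre-fit LightGBM quantile models are wrapped by ConformalizedQuantileRegressor, conformalized on held-out data, and queried with predict_interval; the y_pis[:, 0, 0] and y_pis[:, 1, 0] slices are the lower and upper bounds that the template maps onto the q_* columns. The dataset and variable names are illustrative assumptions.

# Minimal sketch (not from the package): mirrors the mapie.template CQR flow on synthetic data.
# Assumes mapie>=1.0, lightgbm, scikit-learn, and numpy are installed.
import numpy as np
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 5))
y = X[:, 0] * 3.0 + rng.normal(scale=0.5, size=1000)
X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3, random_state=42)

confidence_level = 0.90
lower_q, upper_q = 0.05, 0.95

# Pre-fit the three quantile models (lower, upper, median), as the template does
estimators = []
for q in [lower_q, upper_q, 0.5]:
    est = LGBMRegressor(objective="quantile", alpha=q, n_estimators=200, verbose=-1)
    est.fit(X_train, y_train)
    estimators.append(est)

# Conformalize on held-out data, then predict intervals
mapie_model = ConformalizedQuantileRegressor(estimators, confidence_level=confidence_level, prefit=True)
mapie_model.conformalize(X_cal, y_cal)
y_pred, y_pis = mapie_model.predict_interval(X_cal)

# y_pis has shape (n_samples, 2, 1): [:, 0, 0] is the lower bound, [:, 1, 0] the upper bound
coverage = np.mean((y_cal >= y_pis[:, 0, 0]) & (y_cal <= y_pis[:, 1, 0]))
print(f"Empirical coverage at {confidence_level:.0%}: {coverage:.1%}")

Training one CQR per confidence level, as the template does, trades extra model artifacts for per-level conformal coverage guarantees.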
workbench/model_scripts/uq_models/requirements.txt
@@ -0,0 +1 @@
+ # Note: Most libs are already in the training/inference images, ONLY specify additional libs here
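
Both templates share the decompress_features helper shown above (and patched in the xgb_model.template hunks below). The hypothetical snippet here, not taken from the package, illustrates what that helper does to a bitstring feature: each bit becomes its own uint8 column, prefixed with the first three characters of the original feature name. The "fingerprint" and "logp" columns are made-up examples.

import numpy as np
import pandas as pd

# Hypothetical input: a 'fingerprint' feature stored as a fixed-width bitstring
df = pd.DataFrame({"fingerprint": ["1010", "0111"], "logp": [1.2, 3.4]})

# Same expansion the templates perform inside decompress_features()
bit_matrix = np.array([list(bs) for bs in df["fingerprint"]], dtype=np.uint8)
new_cols = [f"fin_{i}" for i in range(bit_matrix.shape[1])]  # prefix = feature[:3]
df = pd.concat(
    [df.drop(columns=["fingerprint"]), pd.DataFrame(bit_matrix, columns=new_cols, index=df.index)],
    axis=1,
)

print(df)
#    logp  fin_0  fin_1  fin_2  fin_3
# 0   1.2      1      0      1      0
# 1   3.4      0      1      1      1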
workbench/model_scripts/xgb_model/xgb_model.template
@@ -29,13 +29,15 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
      "model_type": "{{model_type}}",
-     "target_column": "{{target_column}}",
+     "target": "{{target_column}}",
      "features": "{{feature_list}}",
      "compressed_features": "{{compressed_features}}",
      "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-     "train_all_data": "{{train_all_data}}"
+     "train_all_data": "{{train_all_data}}",
+     "hyperparameters": "{{hyperparameters}}",
  }

+
  # Function to check if dataframe is empty
  def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
      """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
      proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

      # Drop any proba columns and reset the index in prep for the concat
-     df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+     df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
      df = df.reset_index(drop=True)

      # Concatenate the new columns with the original DataFrame
@@ -88,13 +90,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
      """
      Matches and renames DataFrame columns to match model feature names (case-insensitive).
      Prioritizes exact matches, then case-insensitive matches.
-
+
      Raises ValueError if any model features cannot be matched.
      """
      df_columns_lower = {col.lower(): col for col in df.columns}
      rename_dict = {}
      missing = []
-
      for feature in model_features:
          if feature in df.columns:
              continue  # Exact match
@@ -102,10 +103,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
              rename_dict[df_columns_lower[feature.lower()]] = feature
          else:
              missing.append(feature)
-
+
      if missing:
          raise ValueError(f"Features not found: {missing}")
-
+
+     # Rename the DataFrame columns to match the model features
      return df.rename(columns=rename_dict)


@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
      return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
-     """Prepare features for the XGBoost model
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+     """Prepare features for the model by decompressing bitstring features

      Args:
          df (pd.DataFrame): The features DataFrame
@@ -166,7 +170,7 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
          )

      # Decompress the specified compressed features
-     decompressed_features = features
+     decompressed_features = features.copy()
      for feature in compressed_features:
          if (feature not in df.columns) or (feature not in features):
              print(f"Feature '{feature}' not in the features list, skipping decompression.")
@@ -197,13 +201,14 @@ if __name__ == "__main__":
      """The main function is for training the XGBoost model"""

      # Harness Template Parameters
-     target = TEMPLATE_PARAMS["target_column"]
+     target = TEMPLATE_PARAMS["target"]
      features = TEMPLATE_PARAMS["features"]
      orig_features = features.copy()
      compressed_features = TEMPLATE_PARAMS["compressed_features"]
      model_type = TEMPLATE_PARAMS["model_type"]
      model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
+     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
      validation_split = 0.2

      # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
      args = parser.parse_args()

      # Read the training data into DataFrames
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train)
-         if file.endswith(".csv")
-     ]
+     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
      print(f"Training Files: {training_files}")

      # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
      else:
          # Just do a random training Split
          print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(
-             all_df, test_size=validation_split, random_state=42
-         )
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
      print(f"FIT/TRAIN: {df_train.shape}")
      print(f"VALIDATION: {df_val.shape}")

+     # Use any hyperparameters to set up both the trainer and model configurations
+     print(f"Hyperparameters: {hyperparameters}")
+
      # Now spin up our XGB Model
      if model_type == "classifier":
-         xgb_model = xgb.XGBClassifier(enable_categorical=True)
+         xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

          # Encode the target column
          label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
          df_val[target] = label_encoder.transform(df_val[target])

      else:
-         xgb_model = xgb.XGBRegressor(enable_categorical=True)
+         xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
          label_encoder = None  # We don't need this for regression

      # Grab our Features, Target and Train the Model
      y_train = df_train[target]
-     X_train= df_train[features]
+     X_train = df_train[features]
      xgb_model.fit(X_train, y_train)

      # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
          label_names = label_encoder.classes_

          # Calculate various model performance metrics
-         scores = precision_recall_fscore_support(
-             y_validate, preds, average=None, labels=label_names
-         )
+         scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

          # Put the scores into a dataframe
          score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
      print(f"NumRows: {len(df_val)}")

      # Now save the model to the standard place/name
-     xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+     joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+     # Save the label encoder if we have one
      if label_encoder:
          joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

  def model_fn(model_dir):
      """Deserialize and return fitted XGBoost model"""
-
-     model_path = os.path.join(model_dir, "xgb_model.json")
-
-     with open(model_path, "r") as f:
-         model_json = json.load(f)
-
-     sklearn_data = model_json['learner']['attributes']['scikit_learn']
-     model_type = json.loads(sklearn_data)['_estimator_type']
-
-     model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-     model = model_class(enable_categorical=True)
-     model.load_model(model_path)
-
+     model_path = os.path.join(model_dir, "xgb_model.joblib")
+     model = joblib.load(model_path)
      return model


@@ -390,7 +381,7 @@ def input_fn(input_data, content_type):
      """Parse input data and return a DataFrame."""
      if not input_data:
          raise ValueError("Empty input data is not supported!")
-
+
      # Decode bytes to string if necessary
      if isinstance(input_data, bytes):
          input_data = input_data.decode("utf-8")
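
The model_fn rewrite in the last hunk replaces the XGBoost JSON round trip (save_model plus re-parsing the embedded scikit_learn attributes to pick XGBClassifier vs. XGBRegressor) with a plain joblib round trip, which preserves the fitted sklearn wrapper, including its estimator type, in one call. A rough sketch of the new pattern is below; the path and toy data are illustrative, not from the package.

import os
import joblib
import numpy as np
import xgboost as xgb

model_dir = "/tmp/model"  # stand-in for SageMaker's SM_MODEL_DIR
os.makedirs(model_dir, exist_ok=True)

# Training side: persist the fitted sklearn-wrapped booster with joblib
X = np.random.rand(100, 4)
y = np.random.rand(100)
model = xgb.XGBRegressor(n_estimators=10)
model.fit(X, y)
joblib.dump(model, os.path.join(model_dir, "xgb_model.joblib"))

# Inference side (model_fn): a single load call restores the classifier/regressor,
# with no need to inspect the saved JSON for '_estimator_type'
restored = joblib.load(os.path.join(model_dir, "xgb_model.joblib"))
print(type(restored).__name__, restored.predict(X[:2]))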