workbench-0.8.205-py3-none-any.whl → workbench-0.8.213-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. workbench/algorithms/models/noise_model.py +388 -0
  2. workbench/api/endpoint.py +3 -6
  3. workbench/api/feature_set.py +1 -1
  4. workbench/api/model.py +5 -11
  5. workbench/cached/cached_model.py +4 -4
  6. workbench/core/artifacts/endpoint_core.py +63 -153
  7. workbench/core/artifacts/model_core.py +21 -19
  8. workbench/core/transforms/features_to_model/features_to_model.py +2 -2
  9. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
  10. workbench/model_script_utils/model_script_utils.py +335 -0
  11. workbench/model_script_utils/pytorch_utils.py +395 -0
  12. workbench/model_script_utils/uq_harness.py +278 -0
  13. workbench/model_scripts/chemprop/chemprop.template +289 -666
  14. workbench/model_scripts/chemprop/generated_model_script.py +292 -669
  15. workbench/model_scripts/chemprop/model_script_utils.py +335 -0
  16. workbench/model_scripts/chemprop/requirements.txt +2 -10
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
  18. workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
  19. workbench/model_scripts/pytorch_model/pytorch.template +350 -607
  20. workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
  21. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  22. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  23. workbench/model_scripts/script_generation.py +2 -5
  24. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  25. workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
  26. workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  28. workbench/model_scripts/xgb_model/xgb_model.template +344 -407
  29. workbench/scripts/training_test.py +85 -0
  30. workbench/utils/chemprop_utils.py +18 -656
  31. workbench/utils/metrics_utils.py +172 -0
  32. workbench/utils/model_utils.py +104 -47
  33. workbench/utils/pytorch_utils.py +32 -472
  34. workbench/utils/xgboost_local_crossfold.py +267 -0
  35. workbench/utils/xgboost_model_utils.py +49 -356
  36. workbench/web_interface/components/plugins/model_details.py +30 -68
  37. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/METADATA +5 -5
  38. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/RECORD +42 -31
  39. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/entry_points.txt +1 -0
  40. workbench/model_scripts/uq_models/mapie.template +0 -605
  41. workbench/model_scripts/uq_models/requirements.txt +0 -1
  42. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/WHEEL +0 -0
  43. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/licenses/LICENSE +0 -0
  44. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/top_level.txt +0 -0
workbench/model_scripts/uq_models/mapie.template (deleted)
@@ -1,605 +0,0 @@
- # Model: XGBoost for point predictions + LightGBM with MAPIE for conformalized intervals
- from mapie.regression import ConformalizedQuantileRegressor
- from lightgbm import LGBMRegressor
- from xgboost import XGBRegressor
- from sklearn.model_selection import train_test_split
-
- # Model Performance Scores
- from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
- from scipy.stats import spearmanr
-
- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import numpy as np
- import pandas as pd
- from typing import List, Tuple, Optional, Dict
-
- # Template Placeholders
- TEMPLATE_PARAMS = {
-     "target": "{{target_column}}",
-     "features": "{{feature_list}}",
-     "compressed_features": "{{compressed_features}}",
-     "train_all_data": "{{train_all_data}}",
-     "hyperparameters": "{{hyperparameters}}",
- }
-
-
- def compute_confidence(
-     df: pd.DataFrame,
-     median_interval_width: float,
-     lower_q: str = "q_10",
-     upper_q: str = "q_90",
-     alpha: float = 1.0,
-     beta: float = 1.0,
- ) -> pd.DataFrame:
-     """
-     Compute confidence scores (0.0 to 1.0) based on prediction interval width
-     and distance from median using exponential decay.
-
-     Args:
-         df: DataFrame with 'prediction', 'q_50', and quantile columns
-         median_interval_width: Pre-computed median interval width from training data
-         lower_q: Lower quantile column name (default: 'q_10')
-         upper_q: Upper quantile column name (default: 'q_90')
-         alpha: Weight for interval width term (default: 1.0)
-         beta: Weight for distance from median term (default: 1.0)
-
-     Returns:
-         DataFrame with added 'confidence' column
-     """
-     # Interval width
-     interval_width = (df[upper_q] - df[lower_q]).abs()
-
-     # Distance from median, normalized by interval width
-     distance_from_median = (df['prediction'] - df['q_50']).abs()
-     normalized_distance = distance_from_median / (interval_width + 1e-6)
-
-     # Cap the distance penalty at 1.0
-     normalized_distance = np.minimum(normalized_distance, 1.0)
-
-     # Confidence using exponential decay
-     interval_term = interval_width / median_interval_width
-     df['confidence'] = np.exp(-(alpha * interval_term + beta * normalized_distance))
-
-     return df
-
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """
-     Check if the provided dataframe is empty and raise an exception if it is.
-
-     Args:
-         df (pd.DataFrame): DataFrame to check
-         df_name (str): Name of the DataFrame
-     """
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """
-     Matches and renames DataFrame columns to match model feature names (case-insensitive).
-     Prioritizes exact matches, then case-insensitive matches.
-
-     Raises ValueError if any model features cannot be matched.
-     """
-     df_columns_lower = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-     missing = []
-     for feature in model_features:
-         if feature in df.columns:
-             continue  # Exact match
-         elif feature.lower() in df_columns_lower:
-             rename_dict[df_columns_lower[feature.lower()]] = feature
-         else:
-             missing.append(feature)
-
-     if missing:
-         raise ValueError(f"Features not found: {missing}")
-
-     # Rename the DataFrame columns to match the model features
-     return df.rename(columns=rename_dict)
-
-
- def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
-     """
-     Converts appropriate columns to categorical type with consistent mappings.
-
-     Args:
-         df (pd.DataFrame): The DataFrame to process.
-         features (list): List of feature names to consider for conversion.
-         category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
-             training mode. If populated, we're in inference mode.
-
-     Returns:
-         tuple: (processed DataFrame, category mappings dictionary)
-     """
-     # Training mode
-     if category_mappings == {}:
-         for col in df.select_dtypes(include=["object", "string"]):
-             if col in features and df[col].nunique() < 20:
-                 print(f"Training mode: Converting {col} to category")
-                 df[col] = df[col].astype("category")
-                 category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
-
-     # Inference mode
-     else:
-         for col, categories in category_mappings.items():
-             if col in df.columns:
-                 print(f"Inference mode: Applying categorical mapping for {col}")
-                 df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
-
-     return df, category_mappings
-
-
- def decompress_features(
-     df: pd.DataFrame, features: List[str], compressed_features: List[str]
- ) -> Tuple[pd.DataFrame, List[str]]:
-     """Prepare features for the model by decompressing bitstring features
-
-     Args:
-         df (pd.DataFrame): The features DataFrame
-         features (List[str]): Full list of feature names
-         compressed_features (List[str]): List of feature names to decompress (bitstrings)
-
-     Returns:
-         pd.DataFrame: DataFrame with the decompressed features
-         List[str]: Updated list of feature names after decompression
-
-     Raises:
-         ValueError: If any missing values are found in the specified features
-     """
-
-     # Check for any missing values in the required features
-     missing_counts = df[features].isna().sum()
-     if missing_counts.any():
-         missing_features = missing_counts[missing_counts > 0]
-         print(
-             f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
-             "WARNING: You might want to remove/replace all NaN values before processing."
-         )
-
-     # Decompress the specified compressed features
-     decompressed_features = features.copy()
-     for feature in compressed_features:
-         if (feature not in df.columns) or (feature not in features):
-             print(f"Feature '{feature}' not in the features list, skipping decompression.")
-             continue
-
-         # Remove the feature from the list of features to avoid duplication
-         decompressed_features.remove(feature)
-
-         # Handle all compressed features as bitstrings
-         bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
-         prefix = feature[:3]
-
-         # Create all new columns at once - avoids fragmentation
-         new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
-         new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
-
-         # Add to features list
-         decompressed_features.extend(new_col_names)
-
-         # Drop original column and concatenate new ones
-         df = df.drop(columns=[feature])
-         df = pd.concat([df, new_df], axis=1)
-
-     return df, decompressed_features
-
-
- if __name__ == "__main__":
-     # Template Parameters
-     target = TEMPLATE_PARAMS["target"]
-     features = TEMPLATE_PARAMS["features"]
-     orig_features = features.copy()
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-     validation_split = 0.2
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Read the training data into DataFrames
-     training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
-     print(f"Training Files: {training_files}")
-
-     # Combine files and read them all into a single pandas dataframe
-     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the dataframe is empty
-     check_dataframe(all_df, "training_df")
-
-     # Features/Target output
-     print(f"Target: {target}")
-     print(f"Features: {str(features)}")
-
-     # Convert any features that might be categorical to 'category' type
-     all_df, category_mappings = convert_categorical_types(all_df, features)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print(f"Decompressing features {compressed_features}...")
-         all_df, features = decompress_features(all_df, features, compressed_features)
-
-     # Do we want to train on all the data?
-     if train_all_data:
-         print("Training on ALL of the data")
-         df_train = all_df.copy()
-         df_val = all_df.copy()
-
-     # Does the dataframe have a training column?
-     elif "training" in all_df.columns:
-         print("Found training column, splitting data based on training column")
-         df_train = all_df[all_df["training"]]
-         df_val = all_df[~all_df["training"]]
-     else:
-         # Just do a random training Split
-         print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
-     print(f"FIT/TRAIN: {df_train.shape}")
-     print(f"VALIDATION: {df_val.shape}")
-
-     # Extract sample weights if present
-     if 'sample_weight' in df_train.columns:
-         sample_weights = df_train['sample_weight']
-         print(f"Using sample weights: min={sample_weights.min():.2f}, max={sample_weights.max():.2f}, mean={sample_weights.mean():.2f}")
-     else:
-         sample_weights = None
-         print("No sample weights found, training with equal weights")
-
-     # Prepare features and targets for training
-     X_train = df_train[features]
-     X_validate = df_val[features]
-     y_train = df_train[target]
-     y_validate = df_val[target]
-
-     # Train XGBoost for point predictions
-     print("\nTraining XGBoost for point predictions...")
-     print(f"  Hyperparameters: {hyperparameters}")
-     xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
-     xgb_model.fit(X_train, y_train, sample_weight=sample_weights)
-
-     # Evaluate XGBoost performance
-     y_pred_xgb = xgb_model.predict(X_validate)
-     xgb_rmse = root_mean_squared_error(y_validate, y_pred_xgb)
-     xgb_mae = mean_absolute_error(y_validate, y_pred_xgb)
-     xgb_r2 = r2_score(y_validate, y_pred_xgb)
-
-     xgb_medae = median_absolute_error(y_validate, y_pred_xgb)
-     xgb_spearman = spearmanr(y_validate, y_pred_xgb).correlation
-
-     print(f"\nXGBoost Point Prediction Performance:")
-     print(f"rmse: {xgb_rmse:.3f}")
-     print(f"mae: {xgb_mae:.3f}")
-     print(f"medae: {xgb_medae:.3f}")
-     print(f"r2: {xgb_r2:.3f}")
-     print(f"spearmanr: {xgb_spearman:.3f}")
-
-     # Define confidence levels we want to model
-     confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals
-
-     # Store MAPIE models for each confidence level
-     mapie_models = {}
-
-     # Train models for each confidence level
-     for confidence_level in confidence_levels:
-         alpha = 1 - confidence_level
-         lower_q = alpha / 2
-         upper_q = 1 - alpha / 2
-
-         print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
-         print(f"  Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
-
-         # Train three models for this confidence level
-         quantile_estimators = []
-         for q in [lower_q, upper_q, 0.5]:
-             print(f"  Training model for quantile {q:.3f}...")
-             est = LGBMRegressor(
-                 objective="quantile",
-                 alpha=q,
-                 n_estimators=1000,
-                 max_depth=6,
-                 learning_rate=0.01,
-                 num_leaves=31,
-                 min_child_samples=20,
-                 subsample=0.8,
-                 colsample_bytree=0.8,
-                 random_state=42,
-                 verbose=-1,
-                 force_col_wise=True,
-             )
-             est.fit(X_train, y_train)
-             quantile_estimators.append(est)
-
-         # Create MAPIE CQR model for this confidence level
-         print(f"  Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
-         mapie_model = ConformalizedQuantileRegressor(
-             quantile_estimators, confidence_level=confidence_level, prefit=True
-         )
-
-         # Conformalize the model
-         print(f"  Conformalizing with validation data...")
-         mapie_model.conformalize(X_validate, y_validate)
-
-         # Store the model
-         mapie_models[f"mapie_{confidence_level:.2f}"] = mapie_model
-
-         # Validate coverage for this confidence level
-         y_pred, y_pis = mapie_model.predict_interval(X_validate)
-         coverage = np.mean((y_validate >= y_pis[:, 0, 0]) & (y_validate <= y_pis[:, 1, 0]))
-         print(f"  Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
-
-     support = len(df_val)
-     print(f"\nOverall Model Performance Summary:")
-     print(f"rmse: {xgb_rmse:.3f}")
-     print(f"mae: {xgb_mae:.3f}")
-     print(f"medae: {xgb_medae:.3f}")
-     print(f"r2: {xgb_r2:.3f}")
-     print(f"spearmanr: {xgb_spearman:.3f}")
-     print(f"support: {support}")
-
-     # Analyze interval widths across confidence levels
-     print(f"\nInterval Width Analysis:")
-     for conf_level in confidence_levels:
-         model = mapie_models[f"mapie_{conf_level:.2f}"]
-         _, y_pis = model.predict_interval(X_validate)
-         widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
-         print(f"  {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
-
-     # Compute normalization statistics for confidence calculation
-     print(f"\nComputing normalization statistics for confidence scores...")
-
-     # Add predictions directly to validation dataframe
-     df_val["prediction"] = xgb_model.predict(X_validate)
-
-     # Add all quantile predictions
-     for conf_level in confidence_levels:
-         model_name = f"mapie_{conf_level:.2f}"
-         model = mapie_models[model_name]
-         y_pred, y_pis = model.predict_interval(X_validate)
-
-         if conf_level == 0.50:
-             df_val["q_25"] = y_pis[:, 0, 0]
-             df_val["q_75"] = y_pis[:, 1, 0]
-             # y_pred is the median prediction
-             df_val["q_50"] = y_pred
-         elif conf_level == 0.68:
-             df_val["q_16"] = y_pis[:, 0, 0]
-             df_val["q_84"] = y_pis[:, 1, 0]
-         elif conf_level == 0.80:
-             df_val["q_10"] = y_pis[:, 0, 0]
-             df_val["q_90"] = y_pis[:, 1, 0]
-         elif conf_level == 0.90:
-             df_val["q_05"] = y_pis[:, 0, 0]
-             df_val["q_95"] = y_pis[:, 1, 0]
-         elif conf_level == 0.95:
-             df_val["q_025"] = y_pis[:, 0, 0]
-             df_val["q_975"] = y_pis[:, 1, 0]
-
-     # Compute normalization stats using q_10 and q_90 (default range)
-     interval_width = (df_val["q_90"] - df_val["q_10"]).abs()
-     median_interval_width = float(interval_width.median())
-     print(f"  Median interval width (q_10-q_90): {median_interval_width:.6f}")
-
-     # Save median interval width for confidence calculation
-     with open(os.path.join(args.model_dir, "median_interval_width.json"), "w") as fp:
-         json.dump(median_interval_width, fp)
-
-     # Save the trained XGBoost model
-     joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
-
-     # Save all MAPIE models
-     for model_name, model in mapie_models.items():
-         joblib.dump(model, os.path.join(args.model_dir, f"{model_name}.joblib"))
-
-     # Save the feature list
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
-
-     # Save category mappings if any
-     if category_mappings:
-         with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-             json.dump(category_mappings, fp)
-
-     # Save model configuration
-     model_config = {
-         "model_type": "XGBoost_MAPIE_CQR_LightGBM",
-         "confidence_levels": confidence_levels,
-         "n_features": len(features),
-         "target": target,
-         "validation_metrics": {
-             "xgb_rmse": float(xgb_rmse),
-             "xgb_mae": float(xgb_mae),
-             "xgb_r2": float(xgb_r2),
-             "n_validation": len(df_val),
-         },
-     }
-     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
-         json.dump(model_config, fp, indent=2)
-
-     print(f"\nModel training complete!")
-     print(f"Saved 1 XGBoost model and {len(mapie_models)} MAPIE models to {args.model_dir}")
-
-
- #
- # Inference Section
- #
- def model_fn(model_dir) -> dict:
-     """Load XGBoost and all MAPIE models from the specified directory."""
-
-     # Load model configuration to know which models to load
-     with open(os.path.join(model_dir, "model_config.json")) as fp:
-         config = json.load(fp)
-
-     # Load XGBoost regressor
-     xgb_path = os.path.join(model_dir, "xgb_model.joblib")
-     xgb_model = joblib.load(xgb_path)
-
-     # Load all MAPIE models
-     mapie_models = {}
-     for conf_level in config["confidence_levels"]:
-         model_name = f"mapie_{conf_level:.2f}"
-         mapie_models[model_name] = joblib.load(os.path.join(model_dir, f"{model_name}.joblib"))
-
-     # Load category mappings if they exist
-     category_mappings = {}
-     category_path = os.path.join(model_dir, "category_mappings.json")
-     if os.path.exists(category_path):
-         with open(category_path) as fp:
-             category_mappings = json.load(fp)
-
-     # Load median interval width for confidence calculation
-     median_interval_width = None
-     median_width_path = os.path.join(model_dir, "median_interval_width.json")
-     if os.path.exists(median_width_path):
-         with open(median_width_path) as fp:
-             median_interval_width = json.load(fp)
-
-     return {
-         "xgb_model": xgb_model,
-         "mapie_models": mapie_models,
-         "confidence_levels": config["confidence_levels"],
-         "category_mappings": category_mappings,
-         "median_interval_width": median_interval_width,
-     }
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data))
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         # Convert categorical columns to string to avoid fillna issues
-         for col in output_df.select_dtypes(include=["category"]).columns:
-             output_df[col] = output_df[col].astype(str)
-         csv_output = output_df.fillna("N/A").to_csv(index=False)
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json"
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, models) -> pd.DataFrame:
-     """Make predictions using XGBoost for point estimates and MAPIE for conformalized intervals
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         models (dict): Dictionary containing XGBoost and MAPIE models
-
-     Returns:
-         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
-     """
-
-     # Flag for outlier stretch adjustment for the prediction intervals
-     # if the predicted values are outside the intervals
-     outlier_stretch = False
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         model_features = json.load(fp)
-
-     # Match features in a case-insensitive manner
-     matched_df = match_features_case_insensitive(df, model_features)
-
-     # Apply categorical mappings if they exist
-     if models.get("category_mappings"):
-         matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
-
-     # Get features for prediction
-     X = matched_df[model_features]
-
-     # Get XGBoost point predictions
-     df["prediction"] = models["xgb_model"].predict(X)
-
-     # Get predictions from each MAPIE model for conformalized intervals
-     for conf_level in models["confidence_levels"]:
-         model_name = f"mapie_{conf_level:.2f}"
-         model = models["mapie_models"][model_name]
-
-         # Get conformalized predictions
-         y_pred, y_pis = model.predict_interval(X)
-
-         # Map confidence levels to quantile names
-         if conf_level == 0.50:  # 50% CI
-             df["q_25"] = y_pis[:, 0, 0]
-             df["q_75"] = y_pis[:, 1, 0]
-             # y_pred is the median prediction
-             df["q_50"] = y_pred
-         elif conf_level == 0.68:  # 68% CI
-             df["q_16"] = y_pis[:, 0, 0]
-             df["q_84"] = y_pis[:, 1, 0]
-         elif conf_level == 0.80:  # 80% CI
-             df["q_10"] = y_pis[:, 0, 0]
-             df["q_90"] = y_pis[:, 1, 0]
-         elif conf_level == 0.90:  # 90% CI
-             df["q_05"] = y_pis[:, 0, 0]
-             df["q_95"] = y_pis[:, 1, 0]
-         elif conf_level == 0.95:  # 95% CI
-             df["q_025"] = y_pis[:, 0, 0]
-             df["q_975"] = y_pis[:, 1, 0]
-
-     # Calculate a pseudo-standard deviation from the 68% interval width
-     df["prediction_std"] = (df["q_84"] - df["q_16"]).abs() / 2.0
-
-     # Reorder the quantile columns for easier reading
-     quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_50", "q_75", "q_84", "q_90", "q_95", "q_975"]
-     other_cols = [col for col in df.columns if col not in quantile_cols]
-     df = df[other_cols + quantile_cols]
-
-     # Adjust the outer quantiles to ensure they encompass the prediction
-     if outlier_stretch:
-         # Lower intervals adjustments
-         df["q_025"] = np.minimum(df["q_025"], df["prediction"])
-         df["q_05"] = np.minimum(df["q_05"], df["prediction"])
-         df["q_10"] = np.minimum(df["q_10"], df["prediction"])
-         df["q_16"] = np.minimum(df["q_16"], df["prediction"])
-         df["q_25"] = np.minimum(df["q_25"], df["prediction"])
-
-         # Upper intervals adjustments
-         df["q_75"] = np.maximum(df["q_75"], df["prediction"])
-         df["q_84"] = np.maximum(df["q_84"], df["prediction"])
-         df["q_90"] = np.maximum(df["q_90"], df["prediction"])
-         df["q_95"] = np.maximum(df["q_95"], df["prediction"])
-         df["q_975"] = np.maximum(df["q_975"], df["prediction"])
-
-     # Compute confidence scores using pre-computed normalization stats
-     df = compute_confidence(
-         df,
-         lower_q="q_10",
-         upper_q="q_90",
-         alpha=1.0,
-         beta=1.0,
-         median_interval_width=models["median_interval_width"],
-     )
-
-     return df
workbench/model_scripts/uq_models/requirements.txt (deleted)
@@ -1 +0,0 @@
- # Note: Most libs are already in the training/inference images, ONLY specify additional libs here