workbench 0.8.192__py3-none-any.whl → 0.8.193__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
@@ -41,52 +41,39 @@ class ModelType(Enum):
  class ModelImages:
  """Class for retrieving workbench inference images"""

- image_uris = {
- # US East 1 images
- ("us-east-1", "training", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
- ),
- ("us-east-1", "inference", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1"
- ),
- ("us-east-1", "pytorch_training", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-pytorch-training:0.1"
- ),
- ("us-east-1", "pytorch_inference", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.1"
- ),
- # US West 2 images
- ("us-west-2", "training", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-general-ml-training:0.1"
- ),
- ("us-west-2", "inference", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-general-ml-inference:0.1"
- ),
- ("us-west-2", "pytorch_training", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-training:0.1"
- ),
- ("us-west-2", "pytorch_inference", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.1"
- ),
- # ARM64 images
- # Meta Endpoint inference images
- ("us-east-1", "meta-endpoint", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-meta-endpoint:0.1"
- ),
- ("us-west-2", "meta-endpoint", "0.1", "x86_64"): (
- "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-meta-endpoint:0.1"
- ),
+ # Account ID
+ ACCOUNT_ID = "507740646243"
+
+ # Image name mappings
+ IMAGE_NAMES = {
+ "training": "py312-general-ml-training",
+ "inference": "py312-general-ml-inference",
+ "pytorch_training": "py312-pytorch-training",
+ "pytorch_inference": "py312-pytorch-inference",
+ "meta-endpoint": "py312-meta-endpoint",
  }

  @classmethod
  def get_image_uri(cls, region, image_type, version="0.1", architecture="x86_64"):
- key = (region, image_type, version, architecture)
- if key in cls.image_uris:
- return cls.image_uris[key]
- else:
- raise ValueError(
- f"No matching image found for region: {region}, image_type: {image_type}, version: {version}"
- )
+ """
+ Dynamically construct ECR image URI.
+
+ Args:
+ region: AWS region (e.g., 'us-east-1', 'us-west-2')
+ image_type: Type of image (e.g., 'training', 'inference', 'pytorch_training')
+ version: Image version (e.g., '0.1', '0.2')
+ architecture: CPU architecture (default: 'x86_64', currently unused but kept for compatibility)
+
+ Returns:
+ ECR image URI string
+ """
+ if image_type not in cls.IMAGE_NAMES:
+ raise ValueError(f"Unknown image_type: {image_type}. Valid types: {list(cls.IMAGE_NAMES.keys())}")
+
+ image_name = cls.IMAGE_NAMES[image_type]
+ uri = f"{cls.ACCOUNT_ID}.dkr.ecr.{region}.amazonaws.com/aws-ml-images/{image_name}:{version}"
+
+ return uri


  class ModelCore(Artifact):
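The refactor above drops the hard-coded (region, type, version, arch) lookup table and composes the ECR URI from ACCOUNT_ID, IMAGE_NAMES, the requested region, and the version. A minimal sketch of the new behavior (the region and version values below are illustrative, not checked against the actual registries):

    uri = ModelImages.get_image_uri("us-west-2", "pytorch_inference", version="0.2")
    # -> "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-pytorch-inference:0.2"
    ModelImages.get_image_uri("us-east-1", "bogus_type")  # raises ValueError listing the valid image types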
@@ -233,7 +233,7 @@ class FeaturesToModel(Transform):
  source_dir = str(Path(script_path).parent)

  # Create a Sagemaker Model with our script
- image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image, "0.1")
+ image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image, "0.2")
  self.estimator = Estimator(
  entry_point=entry_point,
  source_dir=source_dir,
@@ -306,7 +306,7 @@ class FeaturesToModel(Transform):

  # Register our model
  image = ModelImages.get_image_uri(
- self.sm_session.boto_region_name, self.inference_image, "0.1", self.inference_arch
+ self.sm_session.boto_region_name, self.inference_image, "0.2", self.inference_arch
  )
  self.log.important(f"Registering model {self.output_name} with Inference Image {image}...")
  model = self.estimator.create_model(role=self.workbench_role_arn)
@@ -0,0 +1,468 @@
+ # Imports for XGB Model
+ import xgboost as xgb
+ import awswrangler as wr
+ import numpy as np
+
+ # Model Performance Scores
+ from sklearn.metrics import (
+ mean_absolute_error,
+ r2_score,
+ root_mean_squared_error,
+ precision_recall_fscore_support,
+ confusion_matrix,
+ )
+
+ # Classification Encoder
+ from sklearn.preprocessing import LabelEncoder
+
+ # Scikit Learn Imports
+ from sklearn.model_selection import train_test_split
+
+ from io import StringIO
+ import json
+ import argparse
+ import joblib
+ import os
+ import pandas as pd
+ from typing import List, Tuple
+
+ # Template Parameters
+ TEMPLATE_PARAMS = {
+ "model_type": "regressor",
+ "target": "solubility",
+ "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
+ "compressed_features": [],
+ "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-regression/training",
+ "train_all_data": False,
+ "hyperparameters": {},
+ }
+
+
+ # Function to check if dataframe is empty
+ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+ """
+ Check if the provided dataframe is empty and raise an exception if it is.
+
+ Args:
+ df (pd.DataFrame): DataFrame to check
+ df_name (str): Name of the DataFrame
+ """
+ if df.empty:
+ msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+ print(msg)
+ raise ValueError(msg)
+
+
+ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
+ """
+ Expands a column in a DataFrame containing a list of probabilities into separate columns.
+
+ Args:
+ df (pd.DataFrame): DataFrame containing a "pred_proba" column
+ class_labels (List[str]): List of class labels
+
+ Returns:
+ pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
+ """
+
+ # Sanity check
+ proba_column = "pred_proba"
+ if proba_column not in df.columns:
+ raise ValueError('DataFrame does not contain a "pred_proba" column')
+
+ # Construct new column names with '_proba' suffix
+ proba_splits = [f"{label}_proba" for label in class_labels]
+
+ # Expand the proba_column into separate columns for each probability
+ proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
+
+ # Drop any proba columns and reset the index in prep for the concat
+ df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
+ df = df.reset_index(drop=True)
+
+ # Concatenate the new columns with the original DataFrame
+ df = pd.concat([df, proba_df], axis=1)
+ print(df)
+ return df
+
+
+ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+ """
+ Matches and renames DataFrame columns to match model feature names (case-insensitive).
+ Prioritizes exact matches, then case-insensitive matches.
+
+ Raises ValueError if any model features cannot be matched.
+ """
+ df_columns_lower = {col.lower(): col for col in df.columns}
+ rename_dict = {}
+ missing = []
+ for feature in model_features:
+ if feature in df.columns:
+ continue # Exact match
+ elif feature.lower() in df_columns_lower:
+ rename_dict[df_columns_lower[feature.lower()]] = feature
+ else:
+ missing.append(feature)
+
+ if missing:
+ raise ValueError(f"Features not found: {missing}")
+
+ # Rename the DataFrame columns to match the model features
+ return df.rename(columns=rename_dict)
+
+
+ def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+ """
+ Converts appropriate columns to categorical type with consistent mappings.
+
+ Args:
+ df (pd.DataFrame): The DataFrame to process.
+ features (list): List of feature names to consider for conversion.
+ category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+ training mode. If populated, we're in inference mode.
+
+ Returns:
+ tuple: (processed DataFrame, category mappings dictionary)
+ """
+ # Training mode
+ if category_mappings == {}:
+ for col in df.select_dtypes(include=["object", "string"]):
+ if col in features and df[col].nunique() < 20:
+ print(f"Training mode: Converting {col} to category")
+ df[col] = df[col].astype("category")
+ category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
+
+ # Inference mode
+ else:
+ for col, categories in category_mappings.items():
+ if col in df.columns:
+ print(f"Inference mode: Applying categorical mapping for {col}")
+ df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
+
+ return df, category_mappings
+
+
+ def decompress_features(
+ df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
+ """Prepare features for the model by decompressing bitstring features
+
+ Args:
+ df (pd.DataFrame): The features DataFrame
+ features (List[str]): Full list of feature names
+ compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+ Returns:
+ pd.DataFrame: DataFrame with the decompressed features
+ List[str]: Updated list of feature names after decompression
+
+ Raises:
+ ValueError: If any missing values are found in the specified features
+ """
+
+ # Check for any missing values in the required features
+ missing_counts = df[features].isna().sum()
+ if missing_counts.any():
+ missing_features = missing_counts[missing_counts > 0]
+ print(
+ f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+ "WARNING: You might want to remove/replace all NaN values before processing."
+ )
+
+ # Decompress the specified compressed features
+ decompressed_features = features.copy()
+ for feature in compressed_features:
+ if (feature not in df.columns) or (feature not in features):
+ print(f"Feature '{feature}' not in the features list, skipping decompression.")
+ continue
+
+ # Remove the feature from the list of features to avoid duplication
+ decompressed_features.remove(feature)
+
+ # Handle all compressed features as bitstrings
+ bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+ prefix = feature[:3]
+
+ # Create all new columns at once - avoids fragmentation
+ new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+ new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+ # Add to features list
+ decompressed_features.extend(new_col_names)
+
+ # Drop original column and concatenate new ones
+ df = df.drop(columns=[feature])
+ df = pd.concat([df, new_df], axis=1)
+
+ return df, decompressed_features
+
+
+ if __name__ == "__main__":
+ """The main function is for training the XGBoost model"""
+
+ # Harness Template Parameters
+ target = TEMPLATE_PARAMS["target"]
+ features = TEMPLATE_PARAMS["features"]
+ orig_features = features.copy()
+ compressed_features = TEMPLATE_PARAMS["compressed_features"]
+ model_type = TEMPLATE_PARAMS["model_type"]
+ model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
+ train_all_data = TEMPLATE_PARAMS["train_all_data"]
+ hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
+ validation_split = 0.2
+
+ # Script arguments for input/output directories
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+ parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+ parser.add_argument(
+ "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+ )
+ args = parser.parse_args()
+
+ # Read the training data into DataFrames
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
+ print(f"Training Files: {training_files}")
+
+ # Combine files and read them all into a single pandas dataframe
+ all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+ # Check if the dataframe is empty
+ check_dataframe(all_df, "training_df")
+
+ # Features/Target output
+ print(f"Target: {target}")
+ print(f"Features: {str(features)}")
+
+ # Convert any features that might be categorical to 'category' type
+ all_df, category_mappings = convert_categorical_types(all_df, features)
+
+ # If we have compressed features, decompress them
+ if compressed_features:
+ print(f"Decompressing features {compressed_features}...")
+ all_df, features = decompress_features(all_df, features, compressed_features)
+
+ # Do we want to train on all the data?
+ if train_all_data:
+ print("Training on ALL of the data")
+ df_train = all_df.copy()
+ df_val = all_df.copy()
+
+ # Does the dataframe have a training column?
+ elif "training" in all_df.columns:
+ print("Found training column, splitting data based on training column")
+ df_train = all_df[all_df["training"]]
+ df_val = all_df[~all_df["training"]]
+ else:
+ # Just do a random training Split
+ print("WARNING: No training column found, splitting data with random state=42")
+ df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+ print(f"FIT/TRAIN: {df_train.shape}")
+ print(f"VALIDATION: {df_val.shape}")
+
+ # Use any hyperparameters to set up both the trainer and model configurations
+ print(f"Hyperparameters: {hyperparameters}")
+
+ # Now spin up our XGB Model
+ if model_type == "classifier":
+ xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
+
+ # Encode the target column
+ label_encoder = LabelEncoder()
+ df_train[target] = label_encoder.fit_transform(df_train[target])
+ df_val[target] = label_encoder.transform(df_val[target])
+
+ else:
+ xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
+ label_encoder = None # We don't need this for regression
+
+ # Grab our Features, Target and Train the Model
+ y_train = df_train[target]
+ X_train = df_train[features]
+ xgb_model.fit(X_train, y_train)
+
+ # Make Predictions on the Validation Set
+ print(f"Making Predictions on Validation Set...")
+ y_validate = df_val[target]
+ X_validate = df_val[features]
+ preds = xgb_model.predict(X_validate)
+ if model_type == "classifier":
+ # Also get the probabilities for each class
+ print("Processing Probabilities...")
+ probs = xgb_model.predict_proba(X_validate)
+ df_val["pred_proba"] = [p.tolist() for p in probs]
+
+ # Expand the pred_proba column into separate columns for each class
+ print(df_val.columns)
+ df_val = expand_proba_column(df_val, label_encoder.classes_)
+ print(df_val.columns)
+
+ # Decode the target and prediction labels
+ y_validate = label_encoder.inverse_transform(y_validate)
+ preds = label_encoder.inverse_transform(preds)
+
+ # Save predictions to S3 (just the target, prediction, and '_proba' columns)
+ df_val["prediction"] = preds
+ output_columns = [target, "prediction"]
+ output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+ wr.s3.to_csv(
+ df_val[output_columns],
+ path=f"{model_metrics_s3_path}/validation_predictions.csv",
+ index=False,
+ )
+
+ # Report Performance Metrics
+ if model_type == "classifier":
+ # Get the label names and their integer mapping
+ label_names = label_encoder.classes_
+
+ # Calculate various model performance metrics
+ scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
+
+ # Put the scores into a dataframe
+ score_df = pd.DataFrame(
+ {
+ target: label_names,
+ "precision": scores[0],
+ "recall": scores[1],
+ "fscore": scores[2],
+ "support": scores[3],
+ }
+ )
+
+ # We need to get creative with the Classification Metrics
+ metrics = ["precision", "recall", "fscore", "support"]
+ for t in label_names:
+ for m in metrics:
+ value = score_df.loc[score_df[target] == t, m].iloc[0]
+ print(f"Metrics:{t}:{m} {value}")
+
+ # Compute and output the confusion matrix
+ conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
+ for i, row_name in enumerate(label_names):
+ for j, col_name in enumerate(label_names):
+ value = conf_mtx[i, j]
+ print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+
+ else:
+ # Calculate various model performance metrics (regression)
+ rmse = root_mean_squared_error(y_validate, preds)
+ mae = mean_absolute_error(y_validate, preds)
+ r2 = r2_score(y_validate, preds)
+ print(f"RMSE: {rmse:.3f}")
+ print(f"MAE: {mae:.3f}")
+ print(f"R2: {r2:.3f}")
+ print(f"NumRows: {len(df_val)}")
+
+ # Now save the model to the standard place/name
+ joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+ # Save the label encoder if we have one
+ if label_encoder:
+ joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
+
+ # Save the features (this will validate input during predictions)
+ with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+ json.dump(orig_features, fp) # We save the original features, not the decompressed ones
+
+ # Save the category mappings
+ with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+ json.dump(category_mappings, fp)
+
+
+ def model_fn(model_dir):
+ """Deserialize and return fitted XGBoost model"""
+ model_path = os.path.join(model_dir, "xgb_model.joblib")
+ model = joblib.load(model_path)
+ return model
+
+
+ def input_fn(input_data, content_type):
+ """Parse input data and return a DataFrame."""
+ if not input_data:
+ raise ValueError("Empty input data is not supported!")
+
+ # Decode bytes to string if necessary
+ if isinstance(input_data, bytes):
+ input_data = input_data.decode("utf-8")
+
+ if "text/csv" in content_type:
+ return pd.read_csv(StringIO(input_data))
+ elif "application/json" in content_type:
+ return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
+ else:
+ raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+ """Supports both CSV and JSON output formats."""
+ if "text/csv" in accept_type:
+ csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
+ return csv_output, "text/csv"
+ elif "application/json" in accept_type:
+ return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
+ else:
+ raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ def predict_fn(df, model) -> pd.DataFrame:
+ """Make Predictions with our XGB Model
+
+ Args:
+ df (pd.DataFrame): The input DataFrame
+ model: The model used for predictions
+
+ Returns:
+ pd.DataFrame: The DataFrame with the predictions added
+ """
+ compressed_features = TEMPLATE_PARAMS["compressed_features"]
+
+ # Grab our feature columns (from training)
+ model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+ with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+ features = json.load(fp)
+ print(f"Model Features: {features}")
+
+ # Load the category mappings (from training)
+ with open(os.path.join(model_dir, "category_mappings.json")) as fp:
+ category_mappings = json.load(fp)
+
+ # Load our Label Encoder if we have one
+ label_encoder = None
+ if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
+ label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+
+ # We're going to match features in a case-insensitive manner, accounting for all the permutations
+ # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
+ # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+ matched_df = match_features_case_insensitive(df, features)
+
+ # Detect categorical types in the incoming DataFrame
+ matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+
+ # If we have compressed features, decompress them
+ if compressed_features:
+ print("Decompressing features for prediction...")
+ matched_df, features = decompress_features(matched_df, features, compressed_features)
+
+ # Predict the features against our XGB Model
+ X = matched_df[features]
+ predictions = model.predict(X)
+
+ # If we have a label encoder, decode the predictions
+ if label_encoder:
+ predictions = label_encoder.inverse_transform(predictions)
+
+ # Set the predictions on the DataFrame
+ df["prediction"] = predictions
+
+ # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
+ if getattr(model, "predict_proba", None):
+ probs = model.predict_proba(matched_df[features])
+ df["pred_proba"] = [p.tolist() for p in probs]
+
+ # Expand the pred_proba column into separate columns for each class
+ df = expand_proba_column(df, label_encoder.classes_)
+
+ # All done, return the DataFrame with new columns for the predictions
+ return df
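The new generated_model_script.py above ends with the standard SageMaker serving entry points: model_fn loads the artifacts once, then input_fn, predict_fn, and output_fn handle each request. A rough local smoke test of that flow, assuming a csv_payload string that contains the training feature columns (the payload and model directory here are illustrative):

    model = model_fn("/opt/ml/model")                       # loads xgb_model.joblib saved by the training block
    request_df = input_fn(csv_payload, "text/csv")          # csv_payload: assumed CSV string with the feature columns
    result_df = predict_fn(request_df, model)               # adds "prediction" (and *_proba columns for classifiers)
    body, content_type = output_fn(result_df, "text/csv")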
@@ -188,6 +188,20 @@ def uq_model(model: "Model", uq_model_name: str, train_all_data: bool = False) -
  return uq_model


+ def safe_extract_tarfile(tar_path: str, extract_path: str) -> None:
+ """
+ Extract a tarball safely, using data filter if available.
+
+ The filter parameter was backported to Python 3.8+, 3.9+, 3.10.13+, 3.11+
+ as a security patch, but may not be present in older patch versions.
+ """
+ with tarfile.open(tar_path, "r:gz") as tar:
+ if hasattr(tarfile, "data_filter"):
+ tar.extractall(path=extract_path, filter="data")
+ else:
+ tar.extractall(path=extract_path)
+
+
  def load_category_mappings_from_s3(model_artifact_uri: str) -> Optional[dict]:
  """
  Download and extract category mappings from a model artifact in S3.
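The helper above centralizes the guarded extraction, so the call sites in the hunks below only pass a downloaded tarball path and a destination directory. A usage sketch with illustrative paths:

    safe_extract_tarfile("/tmp/model.tar.gz", "/tmp/model_extracted")  # uses filter="data" when the running Python supports it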
@@ -206,9 +220,7 @@ def load_category_mappings_from_s3(model_artifact_uri: str) -> Optional[dict]:
  wr.s3.download(path=model_artifact_uri, local_file=local_tar_path)

  # Extract tarball
- with tarfile.open(local_tar_path, "r:gz") as tar:
- # Note: For 3.12+, can use filter="data" argument
- tar.extractall(path=tmpdir)
+ safe_extract_tarfile(local_tar_path, tmpdir)

  # Look for category mappings in base directory only
  mappings_path = os.path.join(tmpdir, "category_mappings.json")
@@ -3,7 +3,6 @@
  import logging
  import os
  import tempfile
- import tarfile
  import joblib
  import pickle
  import glob
@@ -26,7 +25,7 @@ from scipy.stats import spearmanr
  from sklearn.preprocessing import LabelEncoder

  # Workbench Imports
- from workbench.utils.model_utils import load_category_mappings_from_s3
+ from workbench.utils.model_utils import load_category_mappings_from_s3, safe_extract_tarfile
  from workbench.utils.pandas_utils import convert_categorical_types

  # Set up the log
@@ -50,9 +49,7 @@ def xgboost_model_from_s3(model_artifact_uri: str):
  wr.s3.download(path=model_artifact_uri, local_file=local_tar_path)

  # Extract tarball
- with tarfile.open(local_tar_path, "r:gz") as tar:
- # Note: For 3.12+, can use filter="data" argument
- tar.extractall(path=tmpdir)
+ safe_extract_tarfile(local_tar_path, tmpdir)

  # Define model file patterns to search for (in order of preference)
  patterns = [
@@ -285,8 +282,18 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[Dict[st

  # Check if we got a full sklearn model or need to create one
  if isinstance(loaded_model, (xgb.XGBClassifier, xgb.XGBRegressor)):
- xgb_model = loaded_model
- is_classifier = isinstance(xgb_model, xgb.XGBClassifier)
+ is_classifier = isinstance(loaded_model, xgb.XGBClassifier)
+
+ # Get the model's hyperparameters and ensure enable_categorical=True
+ params = loaded_model.get_params()
+ params["enable_categorical"] = True
+
+ # Create new model with same params but enable_categorical=True
+ if is_classifier:
+ xgb_model = xgb.XGBClassifier(**params)
+ else:
+ xgb_model = xgb.XGBRegressor(**params)
+
  elif isinstance(loaded_model, xgb.Booster):
  # Legacy: got a booster, need to wrap it
  log.warning("Deprecated: Loaded model is a Booster, wrapping in sklearn model.")
@@ -308,10 +315,12 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[Dict[st
  target_col = workbench_model.target()
  feature_cols = workbench_model.features()

- # Convert string features to categorical
+ # Convert string[python] to object, then to category for XGBoost compatibility
+ # This avoids XGBoost's issue with pandas 2.x string[python] dtype in categorical categories
  for col in feature_cols:
- if df[col].dtype in ["object", "string"]:
- df[col] = df[col].astype("category")
+ if pd.api.types.is_string_dtype(df[col]):
+ # Double conversion: string[python] -> object -> category
+ df[col] = df[col].astype("object").astype("category")

  X = df[feature_cols]
  y = df[target_col]
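The dtype comment in this hunk can be seen on a toy Series (a small sketch, assuming pandas 2.x): converting a string-dtype column straight to category keeps StringDtype categories, while routing through object first yields plain object-dtype categories, which is the form XGBoost's categorical support handles.

    import pandas as pd
    s = pd.Series(["low", "high", "low"], dtype="string")
    s.astype("category").cat.categories.dtype                   # StringDtype (string[python]) categories
    s.astype("object").astype("category").cat.categories.dtype  # dtype('O') -- plain object categories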
@@ -440,10 +449,12 @@ def leave_one_out_inference(workbench_model: Any) -> pd.DataFrame:
  target_col = workbench_model.target()
  feature_cols = workbench_model.features()

- # Convert string features to categorical
+ # Convert string[python] to object, then to category for XGBoost compatibility
+ # This avoids XGBoost's issue with pandas 2.x string[python] dtype in categorical categories
  for col in feature_cols:
- if df[col].dtype in ["object", "string"]:
- df[col] = df[col].astype("category")
+ if pd.api.types.is_string_dtype(df[col]):
+ # Double conversion: string[python] -> object -> category
+ df[col] = df[col].astype("object").astype("category")

  # Determine which samples to run LOO on
  if len(df) > 1000:
@@ -39,7 +39,13 @@ class ModelPlot(ComponentInterface):
  # Calculate the distance from the diagonal for each point
  target = model.target()
  df["error"] = abs(df["prediction"] - df[target])
- return ScatterPlot().update_properties(df, color="error", regression_line=True)[0]
+ return ScatterPlot().update_properties(
+ df,
+ color="error",
+ regression_line=True,
+ x=target,
+ y="prediction",
+ )[0]
  else:
  return self.display_text(f"Model Type: {model.model_type}\n\n Awesome Plot Coming Soon!")

@@ -249,8 +249,13 @@ class ModelDetails(PluginInterface):
  if not inference_runs:
  return [], None

- # Set "auto_inference" as the default, if that doesn't exist, set the first
- default_inference_run = "auto_inference" if "auto_inference" in inference_runs else inference_runs[0]
+ # Default inference run (full_cross_fold if it exists, then auto_inference, then first)
+ if "full_cross_fold" in inference_runs:
+ default_inference_run = "full_cross_fold"
+ elif "auto_inference" in inference_runs:
+ default_inference_run = "auto_inference"
+ else:
+ default_inference_run = inference_runs[0]

  # Return the options for the dropdown and the selected value
  return inference_runs, default_inference_run
@@ -1,9 +1,30 @@
  Metadata-Version: 2.4
  Name: workbench
- Version: 0.8.192
+ Version: 0.8.193
  Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
  Author-email: SuperCowPowers LLC <support@supercowpowers.com>
- License-Expression: MIT
+ License: MIT License
+
+ Copyright (c) 2021-2026 SuperCowPowers LLC
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
  Project-URL: Homepage, https://github.com/SuperCowPowers/workbench
  Keywords: SageMaker,Machine Learning,AWS,Python,Utilities
  Classifier: Development Status :: 4 - Beta
@@ -56,7 +56,7 @@ workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99v
  workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
  workbench/core/artifacts/endpoint_core.py,sha256=VH-q-R4pfKyjCOXl2Gq1pxMNf4Ir0YdMw9YIHqd7CVU,51974
  workbench/core/artifacts/feature_set_core.py,sha256=6qOJoJ9_qwtyz4neFY6vMn73Ujjeut7E0dy_e8nYfSE,31462
- workbench/core/artifacts/model_core.py,sha256=vVERAD7cmYW8Y9tG8FmT7GMCWiudj8rJ4wfmucWrHUI,52287
+ workbench/core/artifacts/model_core.py,sha256=hY-2w_N4HtCO-vnfVQP22_PYYsnoAQfZzSS3vrP1wYY,51281
  workbench/core/artifacts/monitor_core.py,sha256=M307yz7tEzOEHgv-LmtVy9jKjSbM98fHW3ckmNYrwlU,27897
  workbench/core/cloud_platform/cloud_meta.py,sha256=-g4-LTC3D0PXb3VfaXdLR1ERijKuHdffeMK_zhD-koQ,8809
  workbench/core/cloud_platform/aws/README.md,sha256=QT5IQXoUHbIA0qQ2wO6_2P2lYjYQFVYuezc22mWY4i8,97
@@ -102,7 +102,7 @@ workbench/core/transforms/features_to_features/__init__.py,sha256=47DEQpj8HBSa-_
  workbench/core/transforms/features_to_features/heavy/emr/Readme.md,sha256=YtQgCEQeKe0CQXQkhzMTYq9xOtCsCYb5P5LW2BmRKWQ,68
  workbench/core/transforms/features_to_features/heavy/glue/Readme.md,sha256=TuyCatWfoDr99zUwvOcxf-TqMkQzaMqXlj5nmFcRzfo,48
  workbench/core/transforms/features_to_model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- workbench/core/transforms/features_to_model/features_to_model.py,sha256=qdE7sKMWPERpmDBsBOAAkZ94FzNZftDmUMNu0Mjq6ns,20115
+ workbench/core/transforms/features_to_model/features_to_model.py,sha256=MHJQbKpzBQzW-ZXVfmYQ_1yvAHVPHsw81udBWotxiac,20115
  workbench/core/transforms/model_to_endpoint/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  workbench/core/transforms/model_to_endpoint/model_to_endpoint.py,sha256=TIYXvuK0s383PwJ4iS6fCRhuif6oIxsoWb4CpMGJjY4,6358
  workbench/core/transforms/pandas_transforms/__init__.py,sha256=xL4MT8-fZ1SFqDbTLc8XyxjupHtB1YR6Ej0AC2nwd7I,894
@@ -153,6 +153,7 @@ workbench/model_scripts/scikit_learn/requirements.txt,sha256=aVvwiJ3LgBUhM_PyFlb
  workbench/model_scripts/scikit_learn/scikit_learn.template,sha256=QQvqx-eX9ZTbYmyupq6R6vIQwosmsmY_MRBPaHyfjdk,12586
  workbench/model_scripts/uq_models/mapie.template,sha256=2HIwB_658IsZiLIV1RViIZBIGgXxDsJPZinDUu8SchU,18961
  workbench/model_scripts/uq_models/requirements.txt,sha256=fw7T7t_YJAXK3T6Ysbesxh_Agx_tv0oYx72cEBTqRDY,98
+ workbench/model_scripts/xgb_model/generated_model_script.py,sha256=0S2WYCcgguGJ3vqiZe9y5CLuhrpHwIOoKVJBIphxQSQ,18129
  workbench/model_scripts/xgb_model/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
  workbench/model_scripts/xgb_model/xgb_model.template,sha256=0uXknIEqgUaIFUfu2gfkxa3WHUr8HBBqBepGUTDvrhQ,17917
  workbench/repl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -213,7 +214,7 @@ workbench/utils/lambda_utils.py,sha256=7GhGRPyXn9o-toWb9HBGSnI8-DhK9YRkwhCSk_mNK
  workbench/utils/license_manager.py,sha256=lNE9zZIglmX3zqqCKBdN1xqTgHCEZgJDxavF6pdG7fc,6825
  workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
  workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
- workbench/utils/model_utils.py,sha256=THs0GXkZdJLkk-sMxEo_wu7-ouGrJZdlN99yDUzO5xk,13515
+ workbench/utils/model_utils.py,sha256=_Gjr318BkMT7hv9M3g7eBYsFluYVPzMjWjMrpNpqx3A,13921
  workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
  workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
  workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
@@ -235,7 +236,7 @@ workbench/utils/workbench_cache.py,sha256=IQchxB81iR4eVggHBxUJdXxUCRkqWz1jKe5gxN
  workbench/utils/workbench_event_bridge.py,sha256=z1GmXOB-Qs7VOgC6Hjnp2DI9nSEWepaSXejACxTIR7o,4150
  workbench/utils/workbench_logging.py,sha256=WCuMWhQwibrvcGAyj96h2wowh6dH7zNlDJ7sWUzdCeI,10263
  workbench/utils/workbench_sqs.py,sha256=RwM80z7YWwdtMaCKh7KWF8v38f7eBRU7kyC7ZhTRuI0,2072
- workbench/utils/xgboost_model_utils.py,sha256=K0lbeb6LpS5oyN8cmK5Mfzz89n9q3p6ikvN39U9rvQw,22090
+ workbench/utils/xgboost_model_utils.py,sha256=lm6XYnPImi3RyHyiJgl2o4HLJ63EghEdSbmwVRMctXg,22786
  workbench/utils/chem_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  workbench/utils/chem_utils/fingerprints.py,sha256=Qvs8jaUwguWUq3Q3j695MY0t0Wk3BvroW-oWBwalMUo,5255
  workbench/utils/chem_utils/misc.py,sha256=Nevf8_opu-uIPrv_1_0ubuFVVo2_fGUkMoLAHB3XAeo,7372
@@ -251,7 +252,7 @@ workbench/web_interface/components/component_interface.py,sha256=QCPWqiZLkVsAEzQ
  workbench/web_interface/components/correlation_matrix.py,sha256=Lv4vRta5-TdxBsu0G8Ea7hyyR3XyPes-k5AfL6qZWEc,6376
  workbench/web_interface/components/data_details_markdown.py,sha256=axDs6eXniglBmvFwIKjpJ5oyT-3D4FO9IcfA_cl-EJ8,9706
  workbench/web_interface/components/endpoint_metric_plots.py,sha256=H0cXuj9UQrrh_2JvRHtq7O8pMXFXKs7o9XpzySENylw,3441
- workbench/web_interface/components/model_plot.py,sha256=fgmOOxdrRkXN2-uNhY3K-pOsY34I-wcfy2MAnhr9ehY,2415
+ workbench/web_interface/components/model_plot.py,sha256=Rojx_ZED4P9gvgeEsUm6xnwMNPoeOyn0evw45BWTITc,2536
  workbench/web_interface/components/plugin_interface.py,sha256=jGRq4igUTVXUT4sDqqsKKI2yjilV0ORNBQq6CjEWE84,9563
  workbench/web_interface/components/plugin_unit_test.py,sha256=UBZtGztLk2oJMDXfExfxkhHdmXr6ayv4NS0RpwGc8ro,7704
  workbench/web_interface/components/regression_plot.py,sha256=k18Bd0fcH7ig6kL5GqC_dINci3_YLle_fSEM32zXtzY,3342
@@ -266,7 +267,7 @@ workbench/web_interface/components/plugins/endpoint_details.py,sha256=0A7g_Lx5-3
  workbench/web_interface/components/plugins/generated_compounds.py,sha256=A6JGlkl7buZUugPK21YgufVFDRoGlHJowaqf8PAmz_s,8056
  workbench/web_interface/components/plugins/graph_plot.py,sha256=JFzuSH_CkEmlaLAgFpzmiEpS3sXov0ycnCfP0VLsK2g,14502
  workbench/web_interface/components/plugins/license_details.py,sha256=UyMSBGxEgdp3m9szDkDUAl_Ua8C5a4RNMdYpYCx354M,5497
- workbench/web_interface/components/plugins/model_details.py,sha256=BBvT3zdGHpRWz4V9SpNO0AFMPaExWMbNAE1_onfCGRI,10294
+ workbench/web_interface/components/plugins/model_details.py,sha256=S5J7LmN39F-oWbPQjndv0T3XKEKLDm6pz3JY4274O2M,10468
  workbench/web_interface/components/plugins/molecule_panel.py,sha256=xGCEI5af8F5lNId5eKUpetdQs_ahnIPdW6U7wKvbz2o,3515
  workbench/web_interface/components/plugins/molecule_viewer.py,sha256=xavixcu4RNzh6Nj_-3-XlK09DgpNx5jGmo3wEPNftiE,4529
  workbench/web_interface/components/plugins/pipeline_details.py,sha256=caiFIakHk-1dGGNW7wlio2X7iAm2_tCNbSjDzoRWGEk,5534
@@ -280,9 +281,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
  workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
  workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
  workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
- workbench-0.8.192.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
- workbench-0.8.192.dist-info/METADATA,sha256=dlskxjoY82E7I0Z-JwO802MdU6hTmk5eWD5hwn1ahq8,9261
- workbench-0.8.192.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- workbench-0.8.192.dist-info/entry_points.txt,sha256=o7ohD4D2oygnHp7i9-C0LfcHDuPW5Tv0JXGAg97DpGk,413
- workbench-0.8.192.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
- workbench-0.8.192.dist-info/RECORD,,
+ workbench-0.8.193.dist-info/licenses/LICENSE,sha256=RTBoTMeEwTgEhS-n8vgQ-VUo5qig0PWVd8xFPKU6Lck,1080
+ workbench-0.8.193.dist-info/METADATA,sha256=vW41RDdu0YekBKST6qabwNkGWXpcDgXaqPhk0YvolRU,10495
+ workbench-0.8.193.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ workbench-0.8.193.dist-info/entry_points.txt,sha256=o7ohD4D2oygnHp7i9-C0LfcHDuPW5Tv0JXGAg97DpGk,413
+ workbench-0.8.193.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+ workbench-0.8.193.dist-info/RECORD,,
@@ -1,6 +1,6 @@
  MIT License

- Copyright (c) 2021-2024 SuperCowPowers LLC
+ Copyright (c) 2021-2026 SuperCowPowers LLC

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal