workbench 0.8.156__py3-none-any.whl → 0.8.158__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

This version of workbench has been flagged as potentially problematic.

@@ -259,7 +259,7 @@ class DataSourceAbstract(Artifact):
     def ready(self) -> bool:
         """Is the DataSource ready?"""

-        # Check if the Artifact is ready
+        # Check if our parent class (Artifact) is ready
         if not super().ready():
             return False

@@ -614,12 +614,8 @@ class FeatureSetCore(Artifact):
         Note: Since FeatureSet is a composite of DataSource and FeatureGroup, we need to
         check both to see if the FeatureSet is ready."""

-        # Check the expected metadata for the FeatureSet
-        expected_meta = self.expected_meta()
-        existing_meta = self.workbench_meta()
-        feature_set_ready = set(existing_meta.keys()).issuperset(expected_meta)
-        if not feature_set_ready:
-            self.log.info(f"FeatureSet {self.name} is not ready!")
+        # Check if our parent class (Artifact) is ready
+        if not super().ready():
             return False

         # Okay now call/return the DataSource ready() method
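Both ready() changes above follow the same pattern: the subclass no longer re-implements the expected-metadata check and instead defers to the Artifact base class before checking its own components. A minimal sketch of that delegation pattern (the method bodies and the data_source attribute are illustrative, not the library's actual implementation):

class Artifact:
    def ready(self) -> bool:
        # Generic readiness: all expected metadata keys are present (simplified)
        return set(self.workbench_meta().keys()).issuperset(self.expected_meta())


class FeatureSetCore(Artifact):
    def ready(self) -> bool:
        # First defer to the parent class (Artifact) check
        if not super().ready():
            return False
        # Then check the other half of the composite: the underlying DataSource
        return self.data_source.ready()  # hypothetical attribute, for illustration only
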
@@ -105,12 +105,22 @@ class AWSAccountClamp:
             self.log.critical("AWS Identity Check Failure: Check AWS_PROFILE and/or Renew SSO Token...")
         return info

-    def check_s3_access(self) -> bool:
-        s3 = self.boto3_session.client("s3")
-        results = s3.list_buckets()
-        for bucket in results["Buckets"]:
-            self.log.info(f"\t{bucket['Name']}")
-        return True
+    def check_workbench_bucket(self) -> bool:
+        """Check if the Workbench S3 Bucket exists and is accessible"""
+        s3 = self.boto3_session.client("s3")  # Use client, not resource
+        try:
+            s3.head_bucket(Bucket=self.workbench_bucket_name)
+            self.log.info(f"The {self.workbench_bucket_name} bucket exists and is accessible")
+            return True
+        except ClientError as e:
+            error_code = e.response["Error"]["Code"]
+            if error_code == "404":
+                self.log.critical(f"The {self.workbench_bucket_name} bucket does not exist")
+            elif error_code == "403":
+                self.log.critical(f"Access denied to {self.workbench_bucket_name} bucket")
+            else:
+                self.log.error(f"Error checking S3 bucket: {e}")
+            return False

     def sagemaker_session(self) -> "SageSession":
         """Create a workbench SageMaker session (using our boto3 refreshable session)
@@ -145,9 +155,9 @@ if __name__ == "__main__":
     aws_account_clamp.check_assumed_role()
     print("Assumed Role Check Success...")

-    print("\n\n*** AWS S3 Access Check ***")
-    aws_account_clamp.check_s3_access()
-    print("S3 Access Check Success...")
+    print("\n\n*** AWS Workbench Bucket Check ***")
+    aws_account_clamp.check_workbench_bucket()
+    print("Workbench Bucket Check Success...")

     print("\n\n*** AWS Sagemaker Session/Client Check ***")
     sm_client = aws_account_clamp.sagemaker_client()
@@ -192,7 +192,7 @@ class AWSDFStore:
         # Update/Insert the DataFrame to S3
         s3_uri = self._generate_s3_uri(location)
         try:
-            wr.s3.to_parquet(df=data, path=s3_uri, dataset=True, mode="overwrite")
+            wr.s3.to_parquet(df=data, path=s3_uri, dataset=True, mode="overwrite", index=True)
            self.log.info(f"Dataframe cached {s3_uri}...")
         except Exception as e:
             self.log.error(f"Failed to cache dataframe '{s3_uri}': {e}")
@@ -167,6 +167,9 @@ class FeaturesToModel(Transform):
         if self.custom_script:
             script_path = self.custom_script
             if self.custom_script.endswith(".template"):
+                # Model Type is an enumerated type, so we need to convert it to a string
+                template_params["model_type"] = template_params["model_type"].value
+
                 # Fill in the custom script template with specific parameters (include any custom args)
                 template_params.update(self.custom_args)
                 script_path = fill_template(self.custom_script, template_params, "generated_model_script.py")
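The new lines exist because model_type arrives as a ModelType enum member, and substituting an enum straight into a text template would render as "ModelType.REGRESSOR" rather than the plain string the generated script expects, so .value is taken first. A tiny illustration with a simplified stand-in enum:

from enum import Enum

class ModelType(Enum):  # simplified stand-in for workbench's ModelType
    REGRESSOR = "regressor"
    CLASSIFIER = "classifier"

template_params = {"model_type": ModelType.REGRESSOR}
print(f"model_type = {template_params['model_type']}")   # model_type = ModelType.REGRESSOR

template_params["model_type"] = template_params["model_type"].value
print(f"model_type = {template_params['model_type']}")   # model_type = regressor
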
@@ -316,10 +319,10 @@ if __name__ == "__main__":

     # Regression Model
     input_name = "abalone_features"
-    output_name = "abalone-regression"
+    output_name = "test-abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
-    to_model.set_output_tags(["abalone", "public"])
-    to_model.transform(target_column="class_number_of_rings", description="Abalone Regression")
+    to_model.set_output_tags(["test"])
+    to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")

     """
     # Classification Model
@@ -378,13 +378,18 @@ class PandasToFeatures(Transform):
     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
         while status == "Creating":
-            self.log.debug("FeatureSet being Created...")
+            self.log.debug("FeatureSet being Created")
             time.sleep(5)
             status = feature_group.describe().get("FeatureGroupStatus")
+
         if status == "Created":
             self.log.info(f"FeatureSet {feature_group.name} successfully created")
         else:
+            # Get the detailed failure reason
+            description = feature_group.describe()
+            failure_reason = description.get("FailureReason", "No failure reason provided")
             self.log.critical(f"FeatureSet {feature_group.name} creation failed with status: {status}")
+            self.log.critical(f"Failure reason: {failure_reason}")

     def wait_for_rows(self, expected_rows: int):
         """Wait for AWS Feature Group to fully populate the Offline Storage"""
@@ -0,0 +1,9 @@
+# Neural Network Model Scripts
+Welcome to the set of neural network model scripts.
+Right now we just have:
+
+- PyTorch
+- ChemProp
+
+
+### References
@@ -0,0 +1,543 @@
+# Imports for PyTorch Tabular Model
+import torch
+from pytorch_tabular import TabularModel
+from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
+from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
+import awswrangler as wr
+import numpy as np
+
+# PyTorch 2.6 compatibility: pytorch-tabular saves complex objects, not just tensors
+# Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
+import os
+
+os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
+
+# Model Performance Scores
+from sklearn.metrics import (
+    mean_absolute_error,
+    r2_score,
+    root_mean_squared_error,
+    precision_recall_fscore_support,
+    confusion_matrix,
+)
+
+# Classification Encoder
+from sklearn.preprocessing import LabelEncoder
+
+# Scikit Learn Imports
+from sklearn.model_selection import train_test_split
+
+from io import StringIO
+import json
+import argparse
+import joblib
+import os
+import pandas as pd
+from typing import List, Tuple
+
+# Template Parameters
+TEMPLATE_PARAMS = {
+    "model_type": "regressor",
+    "target_column": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
+    "compressed_features": [],
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-reg/training",
+    "train_all_data": False
+}
+
+
+# Function to check if dataframe is empty
+def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+    """
+    Check if the provided dataframe is empty and raise an exception if it is.
+
+    Args:
+        df (pd.DataFrame): DataFrame to check
+        df_name (str): Name of the DataFrame
+    """
+    if df.empty:
+        msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+        print(msg)
+        raise ValueError(msg)
+
+
+def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
+    """
+    Expands a column in a DataFrame containing a list of probabilities into separate columns.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing a "pred_proba" column
+        class_labels (List[str]): List of class labels
+
+    Returns:
+        pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
+    """
+
+    # Sanity check
+    proba_column = "pred_proba"
+    if proba_column not in df.columns:
+        raise ValueError('DataFrame does not contain a "pred_proba" column')
+
+    # Construct new column names with '_proba' suffix
+    proba_splits = [f"{label}_proba" for label in class_labels]
+
+    # Expand the proba_column into separate columns for each probability
+    proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
+
+    # Drop any proba columns and reset the index in prep for the concat
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
+    df = df.reset_index(drop=True)
+
+    # Concatenate the new columns with the original DataFrame
+    df = pd.concat([df, proba_df], axis=1)
+    print(df)
+    return df
+
+
+def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+    """
+    Matches and renames DataFrame columns to match model feature names (case-insensitive).
+    Prioritizes exact matches, then case-insensitive matches.
+
+    Raises ValueError if any model features cannot be matched.
+    """
+    df_columns_lower = {col.lower(): col for col in df.columns}
+    rename_dict = {}
+    missing = []
+
+    for feature in model_features:
+        if feature in df.columns:
+            continue  # Exact match
+        elif feature.lower() in df_columns_lower:
+            rename_dict[df_columns_lower[feature.lower()]] = feature
+        else:
+            missing.append(feature)
+
+    if missing:
+        raise ValueError(f"Features not found: {missing}")
+
+    return df.rename(columns=rename_dict)
+
+
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+    """
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
+
+
+if __name__ == "__main__":
+    """The main function is for training the PyTorch Tabular model"""
+
+    # Harness Template Parameters
+    target = TEMPLATE_PARAMS["target_column"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+    model_type = TEMPLATE_PARAMS["model_type"]
+    model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
+    train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    validation_split = 0.2
+
+    # Script arguments for input/output directories
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
+    args = parser.parse_args()
+
+    # Read the training data into DataFrames
+    training_files = [
+        os.path.join(args.train, file)
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
+    ]
+    print(f"Training Files: {training_files}")
+
+    # Combine files and read them all into a single pandas dataframe
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")
+
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)
+
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
+    if train_all_data:
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
+    else:
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")
+
+    # Determine categorical and continuous columns
+    categorical_cols = [col for col in features if df_train[col].dtype.name == 'category']
+    continuous_cols = [col for col in features if col not in categorical_cols]
+
+    print(f"Categorical columns: {categorical_cols}")
+    print(f"Continuous columns: {continuous_cols}")
+
+    # Set up PyTorch Tabular configuration
+    data_config = DataConfig(
+        target=[target],
+        continuous_cols=continuous_cols,
+        categorical_cols=categorical_cols,
+    )
+
+    trainer_config = TrainerConfig(
+        auto_lr_find=True,
+        batch_size=1024,
+        max_epochs=100,
+        early_stopping="valid_loss",
+        early_stopping_patience=20,
+        progress_bar="none",
+    )
+
+    optimizer_config = OptimizerConfig()
+
+    # Choose model configuration based on model type
+    if model_type == "classifier":
+        # Use TabNet for classification
+        model_config = TabNetModelConfig(
+            task="classification",
+            learning_rate=1e-3,
+        )
+
+        # Encode the target column
+        label_encoder = LabelEncoder()
+        df_train[target] = label_encoder.fit_transform(df_train[target])
+        df_val[target] = label_encoder.transform(df_val[target])
+
+    else:
+        # Use CategoryEmbedding for regression
+        model_config = CategoryEmbeddingModelConfig(
+            task="regression",
+            layers="1024-512-512",
+            activation="ReLU",
+            learning_rate=1e-3,
+        )
+        label_encoder = None  # We don't need this for regression
+
+    # Create and train the TabularModel
+    tabular_model = TabularModel(
+        data_config=data_config,
+        model_config=model_config,
+        optimizer_config=optimizer_config,
+        trainer_config=trainer_config,
+    )
+
+    # Train the model
+    tabular_model.fit(train=df_train, validation=df_val)
+
+    # Make Predictions on the Validation Set
+    print(f"Making Predictions on Validation Set...")
+    result = tabular_model.predict(df_val)
+    print(f"Result Columns: {result.columns.tolist()}")
+
+    # For regression: pytorch-tabular returns predictions using the target column name
+    # For classification: pytorch-tabular returns predictions using "prediction" column
+    if model_type == "classifier":
+        preds = result["prediction"].values
+    else:
+        # Regression: use the target column name
+        preds = result[target].values
+
+    if model_type == "classifier":
+        # Get probabilities for classification
+        print("Processing Probabilities...")
+        prob_cols = [col for col in result.columns if col.endswith("_probability")]
+        if prob_cols:
+            probs = result[prob_cols].values
+            df_val["pred_proba"] = [p.tolist() for p in probs]
+
+            # Expand the pred_proba column into separate columns for each class
+            print(df_val.columns)
+            df_val = expand_proba_column(df_val, label_encoder.classes_)
+            print(df_val.columns)
+
+        # Decode the target and prediction labels
+        y_validate = label_encoder.inverse_transform(df_val[target])
+        preds = label_encoder.inverse_transform(preds.astype(int))
+    else:
+        y_validate = df_val[target].values
+
+    # Save predictions to S3 (just the target, prediction, and '_proba' columns)
+    df_val["prediction"] = preds
+    output_columns = [target, "prediction"]
+    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+    wr.s3.to_csv(
+        df_val[output_columns],
+        path=f"{model_metrics_s3_path}/validation_predictions.csv",
+        index=False,
+    )
+
+    # Report Performance Metrics
+    if model_type == "classifier":
+        # Get the label names and their integer mapping
+        label_names = label_encoder.classes_
+
+        # Calculate various model performance metrics
+        scores = precision_recall_fscore_support(
+            y_validate, preds, average=None, labels=label_names
+        )
+
+        # Put the scores into a dataframe
+        score_df = pd.DataFrame(
+            {
+                target: label_names,
+                "precision": scores[0],
+                "recall": scores[1],
+                "fscore": scores[2],
+                "support": scores[3],
+            }
+        )
+
+        # We need to get creative with the Classification Metrics
+        metrics = ["precision", "recall", "fscore", "support"]
+        for t in label_names:
+            for m in metrics:
+                value = score_df.loc[score_df[target] == t, m].iloc[0]
+                print(f"Metrics:{t}:{m} {value}")
+
+        # Compute and output the confusion matrix
+        conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
+        for i, row_name in enumerate(label_names):
+            for j, col_name in enumerate(label_names):
+                value = conf_mtx[i, j]
+                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+
+    else:
+        # Calculate various model performance metrics (regression)
+        rmse = root_mean_squared_error(y_validate, preds)
+        mae = mean_absolute_error(y_validate, preds)
+        r2 = r2_score(y_validate, preds)
+        print(f"RMSE: {rmse:.3f}")
+        print(f"MAE: {mae:.3f}")
+        print(f"R2: {r2:.3f}")
+        print(f"NumRows: {len(df_val)}")
+
+    # Save the model to the standard place/name
+    tabular_model.save_model(os.path.join(args.model_dir, "tabular_model"))
+    if label_encoder:
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
+
+    # Save the features (this will validate input during predictions)
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
+
+    # Save the category mappings
+    with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
+        json.dump(category_mappings, fp)
+
+
+def model_fn(model_dir):
+    """Deserialize and return fitted PyTorch Tabular model"""
+    model_path = os.path.join(model_dir, "tabular_model")
+    model = TabularModel.load_model(model_path)
+    return model
+
+
+def input_fn(input_data, content_type):
+    """Parse input data and return a DataFrame."""
+    if not input_data:
+        raise ValueError("Empty input data is not supported!")
+
+    # Decode bytes to string if necessary
+    if isinstance(input_data, bytes):
+        input_data = input_data.decode("utf-8")
+
+    if "text/csv" in content_type:
+        return pd.read_csv(StringIO(input_data))
+    elif "application/json" in content_type:
+        return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
+    else:
+        raise ValueError(f"{content_type} not supported!")
+
+
+def output_fn(output_df, accept_type):
+    """Supports both CSV and JSON output formats."""
+    if "text/csv" in accept_type:
+        csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
+        return csv_output, "text/csv"
+    elif "application/json" in accept_type:
+        return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+    else:
+        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")


+def predict_fn(df, model) -> pd.DataFrame:
+    """Make Predictions with our PyTorch Tabular Model
+
+    Args:
+        df (pd.DataFrame): The input DataFrame
+        model: The TabularModel use for predictions
+
+    Returns:
+        pd.DataFrame: The DataFrame with the predictions added
+    """
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
+
+    # Grab our feature columns (from training)
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+        features = json.load(fp)
+    print(f"Model Features: {features}")
+
+    # Load the category mappings (from training)
+    with open(os.path.join(model_dir, "category_mappings.json")) as fp:
+        category_mappings = json.load(fp)
+
+    # Load our Label Encoder if we have one
+    label_encoder = None
+    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
+        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+
+    # We're going match features in a case-insensitive manner, accounting for all the permutations
+    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
+    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    matched_df = match_features_case_insensitive(df, features)
+
+    # Detect categorical types in the incoming DataFrame
+    matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print("Decompressing features for prediction...")
+        matched_df, features = decompress_features(matched_df, features, compressed_features)
+
+    # Make predictions using the TabularModel
+    result = model.predict(matched_df)
+
+    # Extract predictions based on model type
+    # For regression: pytorch-tabular uses target column name
+    # For classification: pytorch-tabular uses "prediction" column
+    if "prediction" in result.columns:
+        predictions = result["prediction"].values
+    else:
+        # For regression, find the new column (not in original dataframe)
+        pred_cols = [col for col in result.columns if col not in matched_df.columns]
+        if pred_cols:
+            predictions = result[pred_cols[0]].values
+        else:
+            raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
+
+    # If we have a label encoder, decode the predictions
+    if label_encoder:
+        predictions = label_encoder.inverse_transform(predictions.astype(int))
+
+    # Set the predictions on the DataFrame
+    df["prediction"] = predictions
+
+    # For classification, get probabilities
+    if label_encoder is not None:
+        prob_cols = [col for col in result.columns if col.endswith("_probability")]
+        if prob_cols:
+            probs = result[prob_cols].values
+            df["pred_proba"] = [p.tolist() for p in probs]
+
+            # Expand the pred_proba column into separate columns for each class
+            df = expand_proba_column(df, label_encoder.classes_)
+
+    # All done, return the DataFrame with new columns for the predictions
+    return df
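The second half of the new script (model_fn, input_fn, predict_fn, output_fn) follows the SageMaker inference-handler convention: the serving container deserializes the model once, then for each request parses the body, predicts, and serializes the response. A rough sketch of how those handlers chain together in a local smoke test (the input file and model directory are assumptions for illustration):

# Hypothetical local smoke test of the serving handlers defined above.
# The CSV body must contain every feature column the model was trained on.
csv_body = open("sample_rows.csv").read()         # made-up input file

model = model_fn("/opt/ml/model")                 # load the saved TabularModel artifacts
df_in = input_fn(csv_body, "text/csv")            # parse the request body into a DataFrame
df_out = predict_fn(df_in, model)                 # adds "prediction" (and *_proba columns for classifiers)
body, content_type = output_fn(df_out, "text/csv")
print(content_type)
print(body)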