workbench 0.8.157__py3-none-any.whl → 0.8.159__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. workbench/api/feature_set.py +12 -4
  2. workbench/api/meta.py +1 -1
  3. workbench/cached/cached_feature_set.py +1 -0
  4. workbench/cached/cached_meta.py +10 -12
  5. workbench/core/artifacts/cached_artifact_mixin.py +6 -3
  6. workbench/core/artifacts/data_source_abstract.py +1 -1
  7. workbench/core/artifacts/feature_set_core.py +2 -6
  8. workbench/core/artifacts/model_core.py +19 -7
  9. workbench/core/cloud_platform/aws/aws_meta.py +66 -45
  10. workbench/core/cloud_platform/cloud_meta.py +5 -2
  11. workbench/core/transforms/features_to_model/features_to_model.py +9 -5
  12. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +6 -0
  13. workbench/core/transforms/pandas_transforms/pandas_to_features.py +6 -1
  14. workbench/model_scripts/{custom_models/nn_models → pytorch_model}/generated_model_script.py +170 -156
  15. workbench/model_scripts/{custom_models/nn_models → pytorch_model}/pytorch.template +153 -147
  16. workbench/model_scripts/pytorch_model/requirements.txt +2 -0
  17. workbench/model_scripts/scikit_learn/generated_model_script.py +307 -0
  18. workbench/model_scripts/script_generation.py +6 -2
  19. workbench/model_scripts/xgb_model/generated_model_script.py +5 -5
  20. workbench/repl/workbench_shell.py +4 -9
  21. workbench/utils/cloudwatch_handler.py +1 -9
  22. workbench/utils/json_utils.py +27 -8
  23. workbench/utils/pandas_utils.py +12 -13
  24. workbench/utils/redis_cache.py +28 -13
  25. workbench/utils/workbench_cache.py +20 -14
  26. workbench/web_interface/page_views/endpoints_page_view.py +1 -1
  27. workbench/web_interface/page_views/main_page.py +1 -1
  28. {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/METADATA +7 -10
  29. {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/RECORD +33 -33
  30. workbench/model_scripts/custom_models/nn_models/Readme.md +0 -9
  31. workbench/model_scripts/custom_models/nn_models/requirements.txt +0 -4
  32. {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/WHEEL +0 -0
  33. {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/entry_points.txt +0 -0
  34. {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/licenses/LICENSE +0 -0
  35. {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/top_level.txt +0 -0
@@ -3,12 +3,12 @@ import os
  import awswrangler as wr
  import numpy as np

- # PyTorch 2.6+ compatibility: pytorch-tabular saves complex objects, not just tensors
+ # PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
  # Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
  os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
  from pytorch_tabular import TabularModel
  from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
- from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig
+ from pytorch_tabular.models import CategoryEmbeddingModelConfig

  # Model Performance Scores
  from sklearn.metrics import (
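The loosened comment ("PyTorch compatibility" rather than "PyTorch 2.6+") still refers to the `weights_only` change that landed in PyTorch 2.6. As a hedged sketch of what the env var works around (the checkpoint path is a placeholder, not part of the package):

```python
import torch

# Since PyTorch 2.6, torch.load() defaults to weights_only=True and refuses to
# unpickle arbitrary Python objects -- such as the config/model wrappers that
# pytorch-tabular stores alongside its tensors -- typically raising an
# UnpicklingError. Two ways around it, for checkpoints you trust:

# 1. Process-wide legacy behavior (what the script above does, set *before* the
#    pytorch_tabular import so its internal torch.load calls are covered):
#    os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"

# 2. Per-call opt-out ("model.ckpt" is a placeholder path):
checkpoint = torch.load("model.ckpt", weights_only=False)
```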
@@ -148,7 +148,9 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
      return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
      """Prepare features for the model

      Args:
@@ -201,6 +203,126 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
      return df, decompressed_features


+ def model_fn(model_dir):
+
+     # Save current working directory
+     original_cwd = os.getcwd()
+     try:
+         # Change to /tmp because PyTorch Tabular needs write access (creates a .pt_tmp directory)
+         os.chdir('/tmp')
+
+         # Load the model
+         model_path = os.path.join(model_dir, "tabular_model")
+         model = TabularModel.load_model(model_path)
+
+     # Restore the original working directory
+     finally:
+         os.chdir(original_cwd)
+
+     return model
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
+         return csv_output, "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ def predict_fn(df, model) -> pd.DataFrame:
+     """Make Predictions with our PyTorch Tabular Model
+
+     Args:
+         df (pd.DataFrame): The input DataFrame
+         model: The TabularModel used for predictions
+
+     Returns:
+         pd.DataFrame: The DataFrame with the predictions added
+     """
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+
+     # Grab our feature columns (from training)
+     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+         features = json.load(fp)
+     print(f"Model Features: {features}")
+
+     # Load the category mappings (from training)
+     with open(os.path.join(model_dir, "category_mappings.json")) as fp:
+         category_mappings = json.load(fp)
+
+     # Load our Label Encoder if we have one
+     label_encoder = None
+     if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
+         label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+
+     # We're going to match features in a case-insensitive manner, accounting for all the permutations
+     # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
+     # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+     matched_df = match_features_case_insensitive(df, features)
+
+     # Detect categorical types in the incoming DataFrame
+     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print("Decompressing features for prediction...")
+         matched_df, features = decompress_features(matched_df, features, compressed_features)
+
+     # Make predictions using the TabularModel
+     result = model.predict(matched_df[features])
+
+     # pytorch-tabular returns predictions in the f"{target}_prediction" column
+     # and classification probabilities in columns ending with "_probability"
+     target = TEMPLATE_PARAMS["target_column"]
+     prediction_column = f"{target}_prediction"
+     if prediction_column in result.columns:
+         predictions = result[prediction_column].values
+     else:
+         raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
+
+     # If we have a label encoder, decode the predictions
+     if label_encoder:
+         predictions = label_encoder.inverse_transform(predictions.astype(int))
+
+     # Set the predictions on the DataFrame
+     df["prediction"] = predictions
+
+     # For classification, get probabilities
+     if label_encoder is not None:
+         prob_cols = [col for col in result.columns if col.endswith("_probability")]
+         if prob_cols:
+             probs = result[prob_cols].values
+             df["pred_proba"] = [p.tolist() for p in probs]
+
+             # Expand the pred_proba column into separate columns for each class
+             df = expand_proba_column(df, label_encoder.classes_)
+
+     # All done, return the DataFrame with new columns for the predictions
+     return df
+
+
  if __name__ == "__main__":
      """The main function is for training the PyTorch Tabular model"""

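`model_fn`, `input_fn`, `predict_fn`, and `output_fn` follow the standard SageMaker inference-handler contract: the serving container calls `model_fn` once at startup, then runs each request through `input_fn` → `predict_fn` → `output_fn`. A hedged local smoke test of that chain (the CSV columns are hypothetical, and it assumes model artifacts are present under SageMaker's standard `/opt/ml/model`):

```python
# Hypothetical two-column CSV payload; the feature names are illustrative only.
payload = b"length,width\n1.0,2.0\n3.0,4.0\n"

model = model_fn("/opt/ml/model")                 # 1. deserialize the model once
df_in = input_fn(payload, "text/csv")             # 2. request body -> DataFrame
df_out = predict_fn(df_in, model)                 # 3. adds "prediction" (and proba) columns
body, mime_type = output_fn(df_out, "text/csv")   # 4. DataFrame -> response body
print(mime_type, body, sep="\n")
```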
@@ -263,14 +385,12 @@ if __name__ == "__main__":
      else:
          # Just do a random training Split
          print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(
-             all_df, test_size=validation_split, random_state=42
-         )
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
      print(f"FIT/TRAIN: {df_train.shape}")
      print(f"VALIDATION: {df_val.shape}")

      # Determine categorical and continuous columns
-     categorical_cols = [col for col in features if df_train[col].dtype.name == 'category']
+     categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
      continuous_cols = [col for col in features if col not in categorical_cols]

      print(f"Categorical columns: {categorical_cols}")
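Beyond the quote-style cleanup, the `dtype.name == "category"` check is what splits features between pytorch-tabular's embedding path and its continuous path, which is why the categorical conversion must run first. A small self-contained illustration:

```python
import pandas as pd

df_train = pd.DataFrame(
    {"color": pd.Categorical(["red", "blue", "red"]), "weight": [1.2, 3.4, 5.6]}
)
features = ["color", "weight"]

# Same split logic as the diff above: pandas "category" dtype -> categorical
categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
continuous_cols = [col for col in features if col not in categorical_cols]
print(categorical_cols, continuous_cols)  # ['color'] ['weight']
```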
@@ -285,11 +405,14 @@

      trainer_config = TrainerConfig(
          auto_lr_find=True,
-         batch_size=1024,
+         batch_size=min(1024, len(df_train) // 4),
          max_epochs=100,
          early_stopping="valid_loss",
-         early_stopping_patience=20,
+         early_stopping_patience=15,
+         checkpoints="valid_loss",
+         accelerator="auto",
          progress_bar="none",
+         gradient_clip_val=1.0,
      )

      optimizer_config = OptimizerConfig()
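One plausible reading of the new `batch_size=min(1024, len(df_train) // 4)` (my gloss, not stated in the diff): small training sets previously fit in a single 1024-row batch, while the new formula guarantees at least four batches per epoch. A quick check of the values it produces:

```python
# batch_size as a function of training-set size under the new formula
for n_rows in (100, 1_000, 10_000, 100_000):
    print(n_rows, "->", min(1024, n_rows // 4))
# 100 -> 25
# 1000 -> 250
# 10000 -> 1024 (the cap kicks in once the training set reaches 4096 rows)
# 100000 -> 1024
```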
@@ -297,27 +420,24 @@
      # Choose model configuration based on model type
      if model_type == "classifier":
          task = "classification"
-         # Use TabNet for classification
-         model_config = TabNetModelConfig(
-             task=task,
-             learning_rate=1e-3,
-         )
-
          # Encode the target column
          label_encoder = LabelEncoder()
          df_train[target] = label_encoder.fit_transform(df_train[target])
          df_val[target] = label_encoder.transform(df_val[target])
-
      else:
          task = "regression"
-         # Use CategoryEmbedding for regression
-         model_config = CategoryEmbeddingModelConfig(
-             task=task,
-             layers="1024-512-512",
-             activation="ReLU",
-             learning_rate=1e-3,
-         )
-         label_encoder = None  # We don't need this for regression
+         label_encoder = None
+
+     # Use CategoryEmbedding for both regression and classification tasks
+     model_config = CategoryEmbeddingModelConfig(
+         task=task,
+         layers="1024-512-512",
+         activation="ReLU",
+         learning_rate=1e-3,
+         dropout=0.1,
+         use_batch_norm=True,
+         initialization="kaiming",
+     )

      # Create and train the TabularModel
      tabular_model = TabularModel(
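The target encoding here pairs with the `label_encoder.inverse_transform` call in `predict_fn`: classes are encoded to integers for training and decoded back to the original labels at inference. A short example of that scikit-learn round trip:

```python
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded = label_encoder.fit_transform(["dog", "cat", "dog", "bird"])
print(encoded)                                  # [2 1 2 0] -- classes are sorted alphabetically
print(label_encoder.classes_)                   # ['bird' 'cat' 'dog']
print(label_encoder.inverse_transform([0, 2]))  # ['bird' 'dog']
```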
@@ -331,16 +451,16 @@
      tabular_model.fit(train=df_train, validation=df_val)

      # Make Predictions on the Validation Set
-     print(f"Making Predictions on Validation Set...")
-     result = tabular_model.predict(df_val)
+     print("Making Predictions on Validation Set...")
+     result = tabular_model.predict(df_val, include_input_features=False)

-     # For regression: pytorch-tabular returns predictions using the target column name
-     # For classification: pytorch-tabular returns predictions using "prediction" column
+     # pytorch-tabular returns predictions in the f"{target}_prediction" column
+     # and classification probabilities in columns ending with "_probability"
      if model_type == "classifier":
-         preds = result["prediction"].values
+         preds = result[f"{target}_prediction"].values
      else:
          # Regression: use the target column name
-         preds = result[target].values
+         preds = result[f"{target}_prediction"].values

      if model_type == "classifier":
          # Get probabilities for classification
@@ -361,10 +481,10 @@
      else:
          y_validate = df_val[target].values

-     # Save predictions to S3 (just the target, prediction, and '_proba' columns)
+     # Save predictions to S3 (just the target, prediction, and '_probability' columns)
      df_val["prediction"] = preds
      output_columns = [target, "prediction"]
-     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+     output_columns += [col for col in df_val.columns if col.endswith("_probability")]
      wr.s3.to_csv(
          df_val[output_columns],
          path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -377,9 +497,7 @@
          label_names = label_encoder.classes_

          # Calculate various model performance metrics
-         scores = precision_recall_fscore_support(
-             y_validate, preds, average=None, labels=label_names
-         )
+         scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

          # Put the scores into a dataframe
          score_df = pd.DataFrame(
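With `average=None`, `precision_recall_fscore_support` returns one value per label rather than a single aggregate, which is what lets the script build a per-class score DataFrame. A compact example:

```python
from sklearn.metrics import precision_recall_fscore_support

y_true = ["cat", "dog", "cat", "dog", "dog"]
y_pred = ["cat", "dog", "dog", "dog", "dog"]

precision, recall, fscore, support = precision_recall_fscore_support(
    y_true, y_pred, average=None, labels=["cat", "dog"]
)
print(precision)  # [1.   0.75] -- per-class precision for "cat", "dog"
print(recall)     # [0.5  1.  ]
print(support)    # [2 3] -- number of true instances per class
```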
@@ -428,115 +546,3 @@ if __name__ == "__main__":
      # Save the category mappings
      with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
          json.dump(category_mappings, fp)
-
-
- def model_fn(model_dir):
-     """Deserialize and return fitted PyTorch Tabular model"""
-     model_path = os.path.join(model_dir, "tabular_model")
-     model = TabularModel.load_model(model_path)
-     return model
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, model) -> pd.DataFrame:
-     """Make Predictions with our PyTorch Tabular Model
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         model: The TabularModel use for predictions
-
-     Returns:
-         pd.DataFrame: The DataFrame with the predictions added
-     """
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         features = json.load(fp)
-     print(f"Model Features: {features}")
-
-     # Load the category mappings (from training)
-     with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-         category_mappings = json.load(fp)
-
-     # Load our Label Encoder if we have one
-     label_encoder = None
-     if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-         label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
-
-     # We're going match features in a case-insensitive manner, accounting for all the permutations
-     # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-     # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-     matched_df = match_features_case_insensitive(df, features)
-
-     # Detect categorical types in the incoming DataFrame
-     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print("Decompressing features for prediction...")
-         matched_df, features = decompress_features(matched_df, features, compressed_features)
-
-     # Make predictions using the TabularModel
-     result = model.predict(matched_df)
-
-     # Extract predictions based on model type
-     # For regression: pytorch-tabular uses target column name
-     # For classification: pytorch-tabular uses "prediction" column
-     if "prediction" in result.columns:
-         predictions = result["prediction"].values
-     else:
-         # For regression, find the new column (not in original dataframe)
-         pred_cols = [col for col in result.columns if col not in matched_df.columns]
-         if pred_cols:
-             predictions = result[pred_cols[0]].values
-         else:
-             raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-
-     # If we have a label encoder, decode the predictions
-     if label_encoder:
-         predictions = label_encoder.inverse_transform(predictions.astype(int))
-
-     # Set the predictions on the DataFrame
-     df["prediction"] = predictions
-
-     # For classification, get probabilities
-     if label_encoder is not None:
-         prob_cols = [col for col in result.columns if col.endswith("_probability")]
-         if prob_cols:
-             probs = result[prob_cols].values
-             df["pred_proba"] = [p.tolist() for p in probs]
-
-             # Expand the pred_proba column into separate columns for each class
-             df = expand_proba_column(df, label_encoder.classes_)
-
-     # All done, return the DataFrame with new columns for the predictions
-     return df
workbench/model_scripts/pytorch_model/requirements.txt (new file)
@@ -0,0 +1,2 @@
+ # Note: The training and inference images already have torch and pytorch-tabular installed.
+ # So we only need to install packages that are not already included in the images.