workbench-0.8.158-py3-none-any.whl → workbench-0.8.159-py3-none-any.whl

This diff compares two publicly released versions of the package and is provided for informational purposes only; it reflects the contents of each version as published to its public registry.
Files changed (31)
  1. workbench/api/feature_set.py +12 -4
  2. workbench/api/meta.py +1 -1
  3. workbench/cached/cached_feature_set.py +1 -0
  4. workbench/cached/cached_meta.py +10 -12
  5. workbench/core/artifacts/cached_artifact_mixin.py +6 -3
  6. workbench/core/artifacts/model_core.py +19 -7
  7. workbench/core/cloud_platform/aws/aws_meta.py +66 -45
  8. workbench/core/cloud_platform/cloud_meta.py +5 -2
  9. workbench/core/transforms/features_to_model/features_to_model.py +9 -5
  10. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +6 -0
  11. workbench/model_scripts/{custom_models/nn_models → pytorch_model}/generated_model_script.py +170 -156
  12. workbench/model_scripts/{custom_models/nn_models → pytorch_model}/pytorch.template +153 -147
  13. workbench/model_scripts/pytorch_model/requirements.txt +2 -0
  14. workbench/model_scripts/scikit_learn/generated_model_script.py +307 -0
  15. workbench/model_scripts/script_generation.py +6 -2
  16. workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
  17. workbench/repl/workbench_shell.py +4 -9
  18. workbench/utils/json_utils.py +27 -8
  19. workbench/utils/pandas_utils.py +12 -13
  20. workbench/utils/redis_cache.py +28 -13
  21. workbench/utils/workbench_cache.py +20 -14
  22. workbench/web_interface/page_views/endpoints_page_view.py +1 -1
  23. workbench/web_interface/page_views/main_page.py +1 -1
  24. {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/METADATA +5 -8
  25. {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/RECORD +29 -29
  26. workbench/model_scripts/custom_models/nn_models/Readme.md +0 -9
  27. workbench/model_scripts/custom_models/nn_models/requirements.txt +0 -4
  28. {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/WHEEL +0 -0
  29. {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/entry_points.txt +0 -0
  30. {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/licenses/LICENSE +0 -0
  31. {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,14 @@
  # Imports for PyTorch Tabular Model
- import torch
- from pytorch_tabular import TabularModel
- from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
- from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
+ import os
  import awswrangler as wr
  import numpy as np

- # PyTorch 2.6 compatibility: pytorch-tabular saves complex objects, not just tensors
+ # PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
  # Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
- import os
-
  os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
+ from pytorch_tabular import TabularModel
+ from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
+ from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig

  # Model Performance Scores
  from sklearn.metrics import (
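The import reshuffle is the substance of this hunk: PyTorch 2.6 changed the default of torch.load() to weights_only=True, which rejects the pickled Python objects pytorch-tabular stores in its checkpoints, so the TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD escape hatch is now set before pytorch_tabular (which imports torch) is pulled in. A minimal sketch of the pattern; the checkpoint path and payload are illustrative:

    import os

    # Set the escape hatch early, so every later torch.load() call in the
    # process falls back to the legacy weights_only=False behavior.
    os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"

    import torch  # noqa: E402  (deliberately imported after the env var is set)

    # Round-trip a checkpoint containing plain Python objects, a stand-in for
    # the richer config objects pytorch-tabular pickles into its checkpoints.
    torch.save({"config": {"layers": "1024-512-512"}}, "/tmp/example_ckpt.pt")
    restored = torch.load("/tmp/example_ckpt.pt")
    print(restored["config"]["layers"])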
@@ -37,11 +35,11 @@ from typing import List, Tuple

  # Template Parameters
  TEMPLATE_PARAMS = {
-     "model_type": "regressor",
-     "target_column": "solubility",
+     "model_type": "classifier",
+     "target_column": "solubility_class",
      "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
      "compressed_features": [],
-     "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-reg/training",
+     "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-class/training",
      "train_all_data": False
  }

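TEMPLATE_PARAMS is the block that script_generation.py (file 15 above) fills in when it renders pytorch.template into generated_model_script.py; this hunk simply reflects that the checked-in example was regenerated from a classifier run instead of a regressor run. A hypothetical sketch of that kind of rendering step; the placeholder token and file handling here are illustrative, not workbench's actual mechanism:

    from pprint import pformat

    # Illustrative parameters, matching the shape of TEMPLATE_PARAMS above
    params = {
        "model_type": "classifier",
        "target_column": "solubility_class",
        "features": ["molwt", "mollogp"],
        "compressed_features": [],
        "model_metrics_s3_path": "s3://example-bucket/models/example/training",
        "train_all_data": False,
    }

    # Render the template by swapping a (hypothetical) placeholder token
    # for a Python literal of the parameters
    with open("pytorch.template") as fp:
        template = fp.read()
    with open("generated_model_script.py", "w") as fp:
        fp.write(template.replace("{{TEMPLATE_PARAMS}}", pformat(params)))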
@@ -150,7 +148,9 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
      return df, category_mappings


- def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+ def decompress_features(
+     df: pd.DataFrame, features: List[str], compressed_features: List[str]
+ ) -> Tuple[pd.DataFrame, List[str]]:
      """Prepare features for the model

      Args:
@@ -203,6 +203,125 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
      return df, decompressed_features


+ def model_fn(model_dir):
+     """Deserialize and return fitted PyTorch Tabular model"""
+     # Save the current working directory
+     original_cwd = os.getcwd()
+     try:
+         # Change to /tmp because PyTorch Tabular needs write access (it creates a .pt_tmp directory)
+         os.chdir('/tmp')
+
+         # Load the model
+         model_path = os.path.join(model_dir, "tabular_model")
+         model = TabularModel.load_model(model_path)
+     finally:
+         # Restore the original working directory
+         os.chdir(original_cwd)
+
+     return model
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
+         return csv_output, "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ def predict_fn(df, model) -> pd.DataFrame:
+     """Make Predictions with our PyTorch Tabular Model
+
+     Args:
+         df (pd.DataFrame): The input DataFrame
+         model: The TabularModel used for predictions
+
+     Returns:
+         pd.DataFrame: The DataFrame with the predictions added
+     """
+     compressed_features = TEMPLATE_PARAMS["compressed_features"]
+
+     # Grab our feature columns (from training)
+     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+         features = json.load(fp)
+     print(f"Model Features: {features}")
+
+     # Load the category mappings (from training)
+     with open(os.path.join(model_dir, "category_mappings.json")) as fp:
+         category_mappings = json.load(fp)
+
+     # Load our Label Encoder if we have one
+     label_encoder = None
+     if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
+         label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+
+     # We're going to match features in a case-insensitive manner, accounting for all the permutations
+     # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
+     # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+     matched_df = match_features_case_insensitive(df, features)
+
+     # Detect categorical types in the incoming DataFrame
+     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
+
+     # If we have compressed features, decompress them
+     if compressed_features:
+         print("Decompressing features for prediction...")
+         matched_df, features = decompress_features(matched_df, features, compressed_features)
+
+     # Make predictions using the TabularModel
+     result = model.predict(matched_df[features])
+
+     # pytorch-tabular returns predictions in the f"{target}_prediction" column
+     # and classification probabilities in columns ending with "_probability"
+     target = TEMPLATE_PARAMS["target_column"]
+     prediction_column = f"{target}_prediction"
+     if prediction_column in result.columns:
+         predictions = result[prediction_column].values
+     else:
+         raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
+
+     # If we have a label encoder, decode the predictions
+     if label_encoder:
+         predictions = label_encoder.inverse_transform(predictions.astype(int))
+
+     # Set the predictions on the DataFrame
+     df["prediction"] = predictions
+
+     # For classification, get probabilities
+     if label_encoder is not None:
+         prob_cols = [col for col in result.columns if col.endswith("_probability")]
+         if prob_cols:
+             probs = result[prob_cols].values
+             df["pred_proba"] = [p.tolist() for p in probs]
+
+             # Expand the pred_proba column into separate columns for each class
+             df = expand_proba_column(df, label_encoder.classes_)
+
+     # All done, return the DataFrame with new columns for the predictions
+     return df
+
+
  if __name__ == "__main__":
      """The main function is for training the PyTorch Tabular model"""

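The four handlers added above follow the SageMaker inference contract: the serving container calls model_fn once at startup, then input_fn, predict_fn, and output_fn for each request. A rough local harness for exercising them in that order; the paths and the sample payload are illustrative, and the training artifacts (tabular_model/, feature_columns.json, category_mappings.json) must already be unpacked in the model directory:

    import os

    # Point SM_MODEL_DIR at a directory holding the training artifacts
    os.environ["SM_MODEL_DIR"] = "/opt/ml/model"

    model = model_fn(os.environ["SM_MODEL_DIR"])

    payload = b"molwt,mollogp\n180.2,1.3\n"  # hypothetical two-feature CSV request
    df_in = input_fn(payload, "text/csv")
    df_out = predict_fn(df_in, model)
    body, mime = output_fn(df_out, "text/csv")
    print(mime)
    print(body)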
@@ -265,14 +394,12 @@ if __name__ == "__main__":
      else:
          # Just do a random training Split
          print("WARNING: No training column found, splitting data with random state=42")
-         df_train, df_val = train_test_split(
-             all_df, test_size=validation_split, random_state=42
-         )
+         df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
      print(f"FIT/TRAIN: {df_train.shape}")
      print(f"VALIDATION: {df_val.shape}")

      # Determine categorical and continuous columns
-     categorical_cols = [col for col in features if df_train[col].dtype.name == 'category']
+     categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
      continuous_cols = [col for col in features if col not in categorical_cols]

      print(f"Categorical columns: {categorical_cols}")
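The dtype check above is all that routes a feature to the embedding path versus the continuous path, so it is worth seeing in isolation; the tiny frame below is illustrative:

    import pandas as pd

    df_train = pd.DataFrame({
        "color": pd.Categorical(["red", "green"]),  # pandas 'category' dtype
        "molwt": [180.2, 46.1],                     # plain float column
    })
    features = ["color", "molwt"]

    categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
    continuous_cols = [col for col in features if col not in categorical_cols]
    print(categorical_cols, continuous_cols)  # ['color'] ['molwt']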
@@ -287,37 +414,39 @@ if __name__ == "__main__":

      trainer_config = TrainerConfig(
          auto_lr_find=True,
-         batch_size=1024,
+         batch_size=min(1024, len(df_train) // 4),
          max_epochs=100,
          early_stopping="valid_loss",
-         early_stopping_patience=20,
+         early_stopping_patience=15,
+         checkpoints="valid_loss",
+         accelerator="auto",
          progress_bar="none",
+         gradient_clip_val=1.0,
      )

      optimizer_config = OptimizerConfig()

      # Choose model configuration based on model type
      if model_type == "classifier":
-         # Use TabNet for classification
-         model_config = TabNetModelConfig(
-             task="classification",
-             learning_rate=1e-3,
-         )
-
+         task = "classification"
          # Encode the target column
          label_encoder = LabelEncoder()
          df_train[target] = label_encoder.fit_transform(df_train[target])
          df_val[target] = label_encoder.transform(df_val[target])
-
      else:
-         # Use CategoryEmbedding for regression
-         model_config = CategoryEmbeddingModelConfig(
-             task="regression",
-             layers="1024-512-512",
-             activation="ReLU",
-             learning_rate=1e-3,
-         )
-         label_encoder = None  # We don't need this for regression
+         task = "regression"
+         label_encoder = None
+
+     # Use CategoryEmbedding for both regression and classification tasks
+     model_config = CategoryEmbeddingModelConfig(
+         task=task,
+         layers="1024-512-512",
+         activation="ReLU",
+         learning_rate=1e-3,
+         dropout=0.1,
+         use_batch_norm=True,
+         initialization="kaiming",
+     )

      # Create and train the TabularModel
      tabular_model = TabularModel(
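The new dynamic batch size deserves a quick sanity check: min(1024, len(df_train) // 4) keeps the old 1024 cap for large training sets but shrinks the batch on small ones, so an epoch always sees at least roughly four gradient updates instead of one giant batch:

    # len(df_train) -> batch size chosen by the rule above
    for n_rows in (200, 2_000, 100_000):
        print(n_rows, "->", min(1024, n_rows // 4))
    # 200 -> 50, 2000 -> 500, 100000 -> 1024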
@@ -332,16 +461,15 @@ if __name__ == "__main__":

      # Make Predictions on the Validation Set
      print(f"Making Predictions on Validation Set...")
-     result = tabular_model.predict(df_val)
-     print(f"Result Columns: {result.columns.tolist()}")
+     result = tabular_model.predict(df_val, include_input_features=False)

-     # For regression: pytorch-tabular returns predictions using the target column name
-     # For classification: pytorch-tabular returns predictions using "prediction" column
+     # pytorch-tabular returns predictions using f"{target}_prediction" column
+     # and classification probabilities in columns ending with "_probability"
      if model_type == "classifier":
-         preds = result["prediction"].values
+         preds = result[f"{target}_prediction"].values
      else:
          # Regression: use the target column name
-         preds = result[target].values
+         preds = result[f"{target}_prediction"].values

      if model_type == "classifier":
          # Get probabilities for classification
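Both branches now read the same f"{target}_prediction" column, matching the convention the new predict_fn relies on. For a classifier, the predict() result frame looks roughly like the stand-in below; the column-name convention is taken from this diff, while the class names and values are made up:

    import pandas as pd

    result = pd.DataFrame({
        "low_probability": [0.12, 0.80],
        "high_probability": [0.88, 0.20],
        "solubility_class_prediction": [1, 0],
    })

    target = "solubility_class"
    preds = result[f"{target}_prediction"].values
    proba_cols = [col for col in result.columns if col.endswith("_probability")]
    print(preds, proba_cols)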
@@ -362,10 +490,10 @@ if __name__ == "__main__":
      else:
          y_validate = df_val[target].values

-     # Save predictions to S3 (just the target, prediction, and '_proba' columns)
+     # Save predictions to S3 (just the target, prediction, and '_probability' columns)
      df_val["prediction"] = preds
      output_columns = [target, "prediction"]
-     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+     output_columns += [col for col in df_val.columns if col.endswith("_probability")]
      wr.s3.to_csv(
          df_val[output_columns],
          path=f"{model_metrics_s3_path}/validation_predictions.csv",
@@ -378,9 +506,7 @@ if __name__ == "__main__":
          label_names = label_encoder.classes_

          # Calculate various model performance metrics
-         scores = precision_recall_fscore_support(
-             y_validate, preds, average=None, labels=label_names
-         )
+         scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

          # Put the scores into a dataframe
          score_df = pd.DataFrame(
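For reference, average=None makes sklearn return one precision, recall, F-score, and support value per label, ordered by the labels argument; a self-contained example:

    from sklearn.metrics import precision_recall_fscore_support

    y_true = ["low", "high", "low", "high", "low"]
    y_pred = ["low", "high", "high", "high", "low"]

    precision, recall, fscore, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=["low", "high"]
    )
    print(precision)  # per-class precision: [1.0, 0.667] for low, high
    print(support)    # class counts in y_true: [3, 2]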
@@ -428,116 +554,4 @@ if __name__ == "__main__":

      # Save the category mappings
      with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
-         json.dump(category_mappings, fp)
-
-
- def model_fn(model_dir):
-     """Deserialize and return fitted PyTorch Tabular model"""
-     model_path = os.path.join(model_dir, "tabular_model")
-     model = TabularModel.load_model(model_path)
-     return model
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data))  # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False)  # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json"  # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, model) -> pd.DataFrame:
-     """Make Predictions with our PyTorch Tabular Model
-
-     Args:
-         df (pd.DataFrame): The input DataFrame
-         model: The TabularModel use for predictions
-
-     Returns:
-         pd.DataFrame: The DataFrame with the predictions added
-     """
-     compressed_features = TEMPLATE_PARAMS["compressed_features"]
-
-     # Grab our feature columns (from training)
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         features = json.load(fp)
-     print(f"Model Features: {features}")
-
-     # Load the category mappings (from training)
-     with open(os.path.join(model_dir, "category_mappings.json")) as fp:
-         category_mappings = json.load(fp)
-
-     # Load our Label Encoder if we have one
-     label_encoder = None
-     if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
-         label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
-
-     # We're going match features in a case-insensitive manner, accounting for all the permutations
-     # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
-     # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
-     matched_df = match_features_case_insensitive(df, features)
-
-     # Detect categorical types in the incoming DataFrame
-     matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
-
-     # If we have compressed features, decompress them
-     if compressed_features:
-         print("Decompressing features for prediction...")
-         matched_df, features = decompress_features(matched_df, features, compressed_features)
-
-     # Make predictions using the TabularModel
-     result = model.predict(matched_df)
-
-     # Extract predictions based on model type
-     # For regression: pytorch-tabular uses target column name
-     # For classification: pytorch-tabular uses "prediction" column
-     if "prediction" in result.columns:
-         predictions = result["prediction"].values
-     else:
-         # For regression, find the new column (not in original dataframe)
-         pred_cols = [col for col in result.columns if col not in matched_df.columns]
-         if pred_cols:
-             predictions = result[pred_cols[0]].values
-         else:
-             raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
-
-     # If we have a label encoder, decode the predictions
-     if label_encoder:
-         predictions = label_encoder.inverse_transform(predictions.astype(int))
-
-     # Set the predictions on the DataFrame
-     df["prediction"] = predictions
-
-     # For classification, get probabilities
-     if label_encoder is not None:
-         prob_cols = [col for col in result.columns if col.endswith("_probability")]
-         if prob_cols:
-             probs = result[prob_cols].values
-             df["pred_proba"] = [p.tolist() for p in probs]
-
-             # Expand the pred_proba column into separate columns for each class
-             df = expand_proba_column(df, label_encoder.classes_)
-
-     # All done, return the DataFrame with new columns for the predictions
-     return df
+         json.dump(category_mappings, fp)