workbench-0.8.158-py3-none-any.whl → workbench-0.8.159-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/api/feature_set.py +12 -4
- workbench/api/meta.py +1 -1
- workbench/cached/cached_feature_set.py +1 -0
- workbench/cached/cached_meta.py +10 -12
- workbench/core/artifacts/cached_artifact_mixin.py +6 -3
- workbench/core/artifacts/model_core.py +19 -7
- workbench/core/cloud_platform/aws/aws_meta.py +66 -45
- workbench/core/cloud_platform/cloud_meta.py +5 -2
- workbench/core/transforms/features_to_model/features_to_model.py +9 -5
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +6 -0
- workbench/model_scripts/{custom_models/nn_models → pytorch_model}/generated_model_script.py +170 -156
- workbench/model_scripts/{custom_models/nn_models → pytorch_model}/pytorch.template +153 -147
- workbench/model_scripts/pytorch_model/requirements.txt +2 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +307 -0
- workbench/model_scripts/script_generation.py +6 -2
- workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
- workbench/repl/workbench_shell.py +4 -9
- workbench/utils/json_utils.py +27 -8
- workbench/utils/pandas_utils.py +12 -13
- workbench/utils/redis_cache.py +28 -13
- workbench/utils/workbench_cache.py +20 -14
- workbench/web_interface/page_views/endpoints_page_view.py +1 -1
- workbench/web_interface/page_views/main_page.py +1 -1
- {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/METADATA +5 -8
- {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/RECORD +29 -29
- workbench/model_scripts/custom_models/nn_models/Readme.md +0 -9
- workbench/model_scripts/custom_models/nn_models/requirements.txt +0 -4
- {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/WHEEL +0 -0
- {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.158.dist-info → workbench-0.8.159.dist-info}/top_level.txt +0 -0
|
@@ -3,12 +3,12 @@ import os
|
|
|
3
3
|
import awswrangler as wr
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
6
|
-
# PyTorch
|
|
6
|
+
# PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
|
|
7
7
|
# Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
|
|
8
8
|
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
|
|
9
9
|
from pytorch_tabular import TabularModel
|
|
10
10
|
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
|
|
11
|
-
from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig
|
|
11
|
+
from pytorch_tabular.models import CategoryEmbeddingModelConfig
|
|
12
12
|
|
|
13
13
|
# Model Performance Scores
|
|
14
14
|
from sklearn.metrics import (
|
|
@@ -148,7 +148,9 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
|
|
|
148
148
|
return df, category_mappings
|
|
149
149
|
|
|
150
150
|
|
|
151
|
-
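def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]: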
|
|
151
|
+
def decompress_features(
|
|
152
|
+
df: pd.DataFrame, features: List[str], compressed_features: List[str]
|
|
153
|
+
) -> Tuple[pd.DataFrame, List[str]]:
|
|
152
154
|
"""Prepare features for the model
|
|
153
155
|
|
|
154
156
|
Args:
|
|
@@ -201,6 +203,126 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
|
|
|
201
203
|
return df, decompressed_features
|
|
202
204
|
|
|
203
205
|
|
|
206
|
+
def model_fn(model_dir):
|
|
207
|
+
|
|
208
|
+
# Save current working directory
|
|
209
|
+
original_cwd = os.getcwd()
|
|
210
|
+
try:
|
|
211
|
+
# Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
|
|
212
|
+
os.chdir('/tmp')
|
|
213
|
+
|
|
214
|
+
# Load the model
|
|
215
|
+
model_path = os.path.join(model_dir, "tabular_model")
|
|
216
|
+
model = TabularModel.load_model(model_path)
|
|
217
|
+
|
|
218
|
+
# Restore the original working directory
|
|
219
|
+
finally:
|
|
220
|
+
os.chdir(original_cwd)
|
|
221
|
+
|
|
222
|
+
return model
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def input_fn(input_data, content_type):
|
|
226
|
+
"""Parse input data and return a DataFrame."""
|
|
227
|
+
if not input_data:
|
|
228
|
+
raise ValueError("Empty input data is not supported!")
|
|
229
|
+
|
|
230
|
+
# Decode bytes to string if necessary
|
|
231
|
+
if isinstance(input_data, bytes):
|
|
232
|
+
input_data = input_data.decode("utf-8")
|
|
233
|
+
|
|
234
|
+
if "text/csv" in content_type:
|
|
235
|
+
return pd.read_csv(StringIO(input_data))
|
|
236
|
+
elif "application/json" in content_type:
|
|
237
|
+
return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
|
|
238
|
+
else:
|
|
239
|
+
raise ValueError(f"{content_type} not supported!")
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def output_fn(output_df, accept_type):
|
|
243
|
+
"""Supports both CSV and JSON output formats."""
|
|
244
|
+
if "text/csv" in accept_type:
|
|
245
|
+
csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
|
|
246
|
+
return csv_output, "text/csv"
|
|
247
|
+
elif "application/json" in accept_type:
|
|
248
|
+
return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
|
|
249
|
+
else:
|
|
250
|
+
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def predict_fn(df, model) -> pd.DataFrame:
|
|
254
|
+
"""Make Predictions with our PyTorch Tabular Model
|
|
255
|
+
|
|
256
|
+
Args:
|
|
257
|
+
df (pd.DataFrame): The input DataFrame
|
|
258
|
+
model: The TabularModel use for predictions
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
pd.DataFrame: The DataFrame with the predictions added
|
|
262
|
+
"""
|
|
263
|
+
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
264
|
+
|
|
265
|
+
# Grab our feature columns (from training)
|
|
266
|
+
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
267
|
+
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
268
|
+
features = json.load(fp)
|
|
269
|
+
print(f"Model Features: {features}")
|
|
270
|
+
|
|
271
|
+
# Load the category mappings (from training)
|
|
272
|
+
with open(os.path.join(model_dir, "category_mappings.json")) as fp:
|
|
273
|
+
category_mappings = json.load(fp)
|
|
274
|
+
|
|
275
|
+
# Load our Label Encoder if we have one
|
|
276
|
+
label_encoder = None
|
|
277
|
+
if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
|
|
278
|
+
label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
|
|
279
|
+
|
|
280
|
+
# We're going match features in a case-insensitive manner, accounting for all the permutations
|
|
281
|
+
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
282
|
+
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
283
|
+
matched_df = match_features_case_insensitive(df, features)
|
|
284
|
+
|
|
285
|
+
# Detect categorical types in the incoming DataFrame
|
|
286
|
+
matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
|
|
287
|
+
|
|
288
|
+
# If we have compressed features, decompress them
|
|
289
|
+
if compressed_features:
|
|
290
|
+
print("Decompressing features for prediction...")
|
|
291
|
+
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
292
|
+
|
|
293
|
+
# Make predictions using the TabularModel
|
|
294
|
+
result = model.predict(matched_df[features])
|
|
295
|
+
|
|
296
|
+
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
297
|
+
# and classification probabilities in columns ending with "_probability"
|
|
298
|
+
target = TEMPLATE_PARAMS["target_column"]
|
|
299
|
+
prediction_column = f"{target}_prediction"
|
|
300
|
+
if prediction_column in result.columns:
|
|
301
|
+
predictions = result[prediction_column].values
|
|
302
|
+
else:
|
|
303
|
+
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
304
|
+
|
|
305
|
+
# If we have a label encoder, decode the predictions
|
|
306
|
+
if label_encoder:
|
|
307
|
+
predictions = label_encoder.inverse_transform(predictions.astype(int))
|
|
308
|
+
|
|
309
|
+
# Set the predictions on the DataFrame
|
|
310
|
+
df["prediction"] = predictions
|
|
311
|
+
|
|
312
|
+
# For classification, get probabilities
|
|
313
|
+
if label_encoder is not None:
|
|
314
|
+
prob_cols = [col for col in result.columns if col.endswith("_probability")]
|
|
315
|
+
if prob_cols:
|
|
316
|
+
probs = result[prob_cols].values
|
|
317
|
+
df["pred_proba"] = [p.tolist() for p in probs]
|
|
318
|
+
|
|
319
|
+
# Expand the pred_proba column into separate columns for each class
|
|
320
|
+
df = expand_proba_column(df, label_encoder.classes_)
|
|
321
|
+
|
|
322
|
+
# All done, return the DataFrame with new columns for the predictions
|
|
323
|
+
return df
|
|
324
|
+
|
|
325
|
+
|
|
204
326
|
if __name__ == "__main__":
|
|
205
327
|
"""The main function is for training the PyTorch Tabular model"""
|
|
206
328
|
|
|
@@ -263,14 +385,12 @@ if __name__ == "__main__":
|
|
|
263
385
|
else:
|
|
264
386
|
# Just do a random training Split
|
|
265
387
|
print("WARNING: No training column found, splitting data with random state=42")
|
|
266
|
-
df_train, df_val = train_test_split(
|
|
267
|
-
all_df, test_size=validation_split, random_state=42
|
|
268
|
-
)
|
|
388
|
+
df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
|
|
269
389
|
print(f"FIT/TRAIN: {df_train.shape}")
|
|
270
390
|
print(f"VALIDATION: {df_val.shape}")
|
|
271
391
|
|
|
272
392
|
# Determine categorical and continuous columns
|
|
273
|
-
categorical_cols = [col for col in features if df_train[col].dtype.name ==
|
|
393
|
+
categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
|
|
274
394
|
continuous_cols = [col for col in features if col not in categorical_cols]
|
|
275
395
|
|
|
276
396
|
print(f"Categorical columns: {categorical_cols}")
|
|
@@ -285,11 +405,14 @@ if __name__ == "__main__":
|
|
|
285
405
|
|
|
286
406
|
trainer_config = TrainerConfig(
|
|
287
407
|
auto_lr_find=True,
|
|
288
|
-
batch_size=1024,
|
|
408
|
+
batch_size=min(1024, len(df_train) // 4),
|
|
289
409
|
max_epochs=100,
|
|
290
410
|
early_stopping="valid_loss",
|
|
291
|
-
early_stopping_patience=
|
|
411
|
+
early_stopping_patience=15,
|
|
412
|
+
checkpoints="valid_loss",
|
|
413
|
+
accelerator="auto",
|
|
292
414
|
progress_bar="none",
|
|
415
|
+
gradient_clip_val=1.0,
|
|
293
416
|
)
|
|
294
417
|
|
|
295
418
|
optimizer_config = OptimizerConfig()
|
|
@@ -297,27 +420,24 @@ if __name__ == "__main__":
|
|
|
297
420
|
# Choose model configuration based on model type
|
|
298
421
|
if model_type == "classifier":
|
|
299
422
|
task = "classification"
|
|
300
|
-
# Use TabNet for classification
|
|
301
|
-
model_config = TabNetModelConfig(
|
|
302
|
-
task=task,
|
|
303
|
-
learning_rate=1e-3,
|
|
304
|
-
)
|
|
305
|
-
|
|
306
423
|
# Encode the target column
|
|
307
424
|
label_encoder = LabelEncoder()
|
|
308
425
|
df_train[target] = label_encoder.fit_transform(df_train[target])
|
|
309
426
|
df_val[target] = label_encoder.transform(df_val[target])
|
|
310
|
-
|
|
311
427
|
else:
|
|
312
428
|
task = "regression"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
429
|
+
label_encoder = None
|
|
430
|
+
|
|
431
|
+
# Use CategoryEmbedding for both regression and classification tasks
|
|
432
|
+
model_config = CategoryEmbeddingModelConfig(
|
|
433
|
+
task=task,
|
|
434
|
+
layers="1024-512-512",
|
|
435
|
+
activation="ReLU",
|
|
436
|
+
learning_rate=1e-3,
|
|
437
|
+
dropout=0.1,
|
|
438
|
+
use_batch_norm=True,
|
|
439
|
+
initialization="kaiming",
|
|
440
|
+
)
|
|
321
441
|
|
|
322
442
|
# Create and train the TabularModel
|
|
323
443
|
tabular_model = TabularModel(
|
|
@@ -331,16 +451,16 @@ if __name__ == "__main__":
|
|
|
331
451
|
tabular_model.fit(train=df_train, validation=df_val)
|
|
332
452
|
|
|
333
453
|
# Make Predictions on the Validation Set
|
|
334
|
-
print(
|
|
335
|
-
result = tabular_model.predict(df_val)
|
|
454
|
+
print("Making Predictions on Validation Set...")
|
|
455
|
+
result = tabular_model.predict(df_val, include_input_features=False)
|
|
336
456
|
|
|
337
|
-
#
|
|
338
|
-
#
|
|
457
|
+
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
458
|
+
# and classification probabilities in columns ending with "_probability"
|
|
339
459
|
if model_type == "classifier":
|
|
340
|
-
preds = result["prediction"].values
|
|
460
|
+
preds = result[f"{target}_prediction"].values
|
|
341
461
|
else:
|
|
342
462
|
# Regression: use the target column name
|
|
343
|
-
preds = result[target].values
|
|
463
|
+
preds = result[f"{target}_prediction"].values
|
|
344
464
|
|
|
345
465
|
if model_type == "classifier":
|
|
346
466
|
# Get probabilities for classification
|
|
@@ -361,10 +481,10 @@ if __name__ == "__main__":
|
|
|
361
481
|
else:
|
|
362
482
|
y_validate = df_val[target].values
|
|
363
483
|
|
|
364
|
-
# Save predictions to S3 (just the target, prediction, and '
|
|
484
|
+
# Save predictions to S3 (just the target, prediction, and '_probability' columns)
|
|
365
485
|
df_val["prediction"] = preds
|
|
366
486
|
output_columns = [target, "prediction"]
|
|
367
|
-
output_columns += [col for col in df_val.columns if col.endswith("
|
|
487
|
+
output_columns += [col for col in df_val.columns if col.endswith("_probability")]
|
|
368
488
|
wr.s3.to_csv(
|
|
369
489
|
df_val[output_columns],
|
|
370
490
|
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
@@ -377,9 +497,7 @@ if __name__ == "__main__":
|
|
|
377
497
|
label_names = label_encoder.classes_
|
|
378
498
|
|
|
379
499
|
# Calculate various model performance metrics
|
|
380
|
-
scores = precision_recall_fscore_support(
|
|
381
|
-
y_validate, preds, average=None, labels=label_names
|
|
382
|
-
)
|
|
500
|
+
scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
|
|
383
501
|
|
|
384
502
|
# Put the scores into a dataframe
|
|
385
503
|
score_df = pd.DataFrame(
|
|
@@ -428,115 +546,3 @@ if __name__ == "__main__":
|
|
|
428
546
|
# Save the category mappings
|
|
429
547
|
with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
|
|
430
548
|
json.dump(category_mappings, fp)
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
def model_fn(model_dir):
|
|
434
|
-
"""Deserialize and return fitted PyTorch Tabular model"""
|
|
435
|
-
model_path = os.path.join(model_dir, "tabular_model")
|
|
436
|
-
model = TabularModel.load_model(model_path)
|
|
437
|
-
return model
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
def input_fn(input_data, content_type):
|
|
441
|
-
"""Parse input data and return a DataFrame."""
|
|
442
|
-
if not input_data:
|
|
443
|
-
raise ValueError("Empty input data is not supported!")
|
|
444
|
-
|
|
445
|
-
# Decode bytes to string if necessary
|
|
446
|
-
if isinstance(input_data, bytes):
|
|
447
|
-
input_data = input_data.decode("utf-8")
|
|
448
|
-
|
|
449
|
-
if "text/csv" in content_type:
|
|
450
|
-
return pd.read_csv(StringIO(input_data))
|
|
451
|
-
elif "application/json" in content_type:
|
|
452
|
-
return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
|
|
453
|
-
else:
|
|
454
|
-
raise ValueError(f"{content_type} not supported!")
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
def output_fn(output_df, accept_type):
|
|
458
|
-
"""Supports both CSV and JSON output formats."""
|
|
459
|
-
if "text/csv" in accept_type:
|
|
460
|
-
csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
|
|
461
|
-
return csv_output, "text/csv"
|
|
462
|
-
elif "application/json" in accept_type:
|
|
463
|
-
return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
|
|
464
|
-
else:
|
|
465
|
-
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
def predict_fn(df, model) -> pd.DataFrame:
|
|
469
|
-
"""Make Predictions with our PyTorch Tabular Model
|
|
470
|
-
|
|
471
|
-
Args:
|
|
472
|
-
df (pd.DataFrame): The input DataFrame
|
|
473
|
-
model: The TabularModel use for predictions
|
|
474
|
-
|
|
475
|
-
Returns:
|
|
476
|
-
pd.DataFrame: The DataFrame with the predictions added
|
|
477
|
-
"""
|
|
478
|
-
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
479
|
-
|
|
480
|
-
# Grab our feature columns (from training)
|
|
481
|
-
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
482
|
-
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
483
|
-
features = json.load(fp)
|
|
484
|
-
print(f"Model Features: {features}")
|
|
485
|
-
|
|
486
|
-
# Load the category mappings (from training)
|
|
487
|
-
with open(os.path.join(model_dir, "category_mappings.json")) as fp:
|
|
488
|
-
category_mappings = json.load(fp)
|
|
489
|
-
|
|
490
|
-
# Load our Label Encoder if we have one
|
|
491
|
-
label_encoder = None
|
|
492
|
-
if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
|
|
493
|
-
label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
|
|
494
|
-
|
|
495
|
-
# We're going match features in a case-insensitive manner, accounting for all the permutations
|
|
496
|
-
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
497
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
498
|
-
matched_df = match_features_case_insensitive(df, features)
|
|
499
|
-
|
|
500
|
-
# Detect categorical types in the incoming DataFrame
|
|
501
|
-
matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
|
|
502
|
-
|
|
503
|
-
# If we have compressed features, decompress them
|
|
504
|
-
if compressed_features:
|
|
505
|
-
print("Decompressing features for prediction...")
|
|
506
|
-
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
507
|
-
|
|
508
|
-
# Make predictions using the TabularModel
|
|
509
|
-
result = model.predict(matched_df)
|
|
510
|
-
|
|
511
|
-
# Extract predictions based on model type
|
|
512
|
-
# For regression: pytorch-tabular uses target column name
|
|
513
|
-
# For classification: pytorch-tabular uses "prediction" column
|
|
514
|
-
if "prediction" in result.columns:
|
|
515
|
-
predictions = result["prediction"].values
|
|
516
|
-
else:
|
|
517
|
-
# For regression, find the new column (not in original dataframe)
|
|
518
|
-
pred_cols = [col for col in result.columns if col not in matched_df.columns]
|
|
519
|
-
if pred_cols:
|
|
520
|
-
predictions = result[pred_cols[0]].values
|
|
521
|
-
else:
|
|
522
|
-
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
523
|
-
|
|
524
|
-
# If we have a label encoder, decode the predictions
|
|
525
|
-
if label_encoder:
|
|
526
|
-
predictions = label_encoder.inverse_transform(predictions.astype(int))
|
|
527
|
-
|
|
528
|
-
# Set the predictions on the DataFrame
|
|
529
|
-
df["prediction"] = predictions
|
|
530
|
-
|
|
531
|
-
# For classification, get probabilities
|
|
532
|
-
if label_encoder is not None:
|
|
533
|
-
prob_cols = [col for col in result.columns if col.endswith("_probability")]
|
|
534
|
-
if prob_cols:
|
|
535
|
-
probs = result[prob_cols].values
|
|
536
|
-
df["pred_proba"] = [p.tolist() for p in probs]
|
|
537
|
-
|
|
538
|
-
# Expand the pred_proba column into separate columns for each class
|
|
539
|
-
df = expand_proba_column(df, label_encoder.classes_)
|
|
540
|
-
|
|
541
|
-
# All done, return the DataFrame with new columns for the predictions
|
|
542
|
-
return df
|