workbench 0.8.157__py3-none-any.whl → 0.8.159__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/api/feature_set.py +12 -4
- workbench/api/meta.py +1 -1
- workbench/cached/cached_feature_set.py +1 -0
- workbench/cached/cached_meta.py +10 -12
- workbench/core/artifacts/cached_artifact_mixin.py +6 -3
- workbench/core/artifacts/data_source_abstract.py +1 -1
- workbench/core/artifacts/feature_set_core.py +2 -6
- workbench/core/artifacts/model_core.py +19 -7
- workbench/core/cloud_platform/aws/aws_meta.py +66 -45
- workbench/core/cloud_platform/cloud_meta.py +5 -2
- workbench/core/transforms/features_to_model/features_to_model.py +9 -5
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +6 -0
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +6 -1
- workbench/model_scripts/{custom_models/nn_models → pytorch_model}/generated_model_script.py +170 -156
- workbench/model_scripts/{custom_models/nn_models → pytorch_model}/pytorch.template +153 -147
- workbench/model_scripts/pytorch_model/requirements.txt +2 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +307 -0
- workbench/model_scripts/script_generation.py +6 -2
- workbench/model_scripts/xgb_model/generated_model_script.py +5 -5
- workbench/repl/workbench_shell.py +4 -9
- workbench/utils/cloudwatch_handler.py +1 -9
- workbench/utils/json_utils.py +27 -8
- workbench/utils/pandas_utils.py +12 -13
- workbench/utils/redis_cache.py +28 -13
- workbench/utils/workbench_cache.py +20 -14
- workbench/web_interface/page_views/endpoints_page_view.py +1 -1
- workbench/web_interface/page_views/main_page.py +1 -1
- {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/METADATA +7 -10
- {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/RECORD +33 -33
- workbench/model_scripts/custom_models/nn_models/Readme.md +0 -9
- workbench/model_scripts/custom_models/nn_models/requirements.txt +0 -4
- {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/WHEEL +0 -0
- {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.157.dist-info → workbench-0.8.159.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,14 @@
|
|
|
1
1
|
# Imports for PyTorch Tabular Model
|
|
2
|
-
import
|
|
3
|
-
from pytorch_tabular import TabularModel
|
|
4
|
-
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
|
|
5
|
-
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
|
|
2
|
+
import os
|
|
6
3
|
import awswrangler as wr
|
|
7
4
|
import numpy as np
|
|
8
5
|
|
|
9
|
-
# PyTorch
|
|
6
|
+
# PyTorch compatibility: pytorch-tabular saves complex objects, not just tensors
|
|
10
7
|
# Use legacy loading behavior for compatibility (recommended by PyTorch docs for this scenario)
|
|
11
|
-
import os
|
|
12
|
-
|
|
13
8
|
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
|
|
9
|
+
from pytorch_tabular import TabularModel
|
|
10
|
+
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
|
|
11
|
+
from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig
|
|
14
12
|
|
|
15
13
|
# Model Performance Scores
|
|
16
14
|
from sklearn.metrics import (
|
|
@@ -37,11 +35,11 @@ from typing import List, Tuple
|
|
|
37
35
|
|
|
38
36
|
# Template Parameters
|
|
39
37
|
TEMPLATE_PARAMS = {
|
|
40
|
-
"model_type": "
|
|
41
|
-
"target_column": "
|
|
38
|
+
"model_type": "classifier",
|
|
39
|
+
"target_column": "solubility_class",
|
|
42
40
|
"features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
|
|
43
41
|
"compressed_features": [],
|
|
44
|
-
"model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-
|
|
42
|
+
"model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-pytorch-class/training",
|
|
45
43
|
"train_all_data": False
|
|
46
44
|
}
|
|
47
45
|
|
|
@@ -150,7 +148,9 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
|
|
|
150
148
|
return df, category_mappings
|
|
151
149
|
|
|
152
150
|
|
|
153
|
-
def decompress_features(
|
|
151
|
+
def decompress_features(
|
|
152
|
+
df: pd.DataFrame, features: List[str], compressed_features: List[str]
|
|
153
|
+
) -> Tuple[pd.DataFrame, List[str]]:
|
|
154
154
|
"""Prepare features for the model
|
|
155
155
|
|
|
156
156
|
Args:
|
|
@@ -203,6 +203,135 @@ def decompress_features(df: pd.DataFrame, features: List[str], compressed_featur
|
|
|
203
203
|
return df, decompressed_features
|
|
204
204
|
|
|
205
205
|
|
|
206
|
+
def model_fn(model_dir):
|
|
207
|
+
"""Deserialize and return fitted PyTorch Tabular model"""
|
|
208
|
+
#
|
|
209
|
+
os.environ['TEMP'] = '/tmp'
|
|
210
|
+
model_path = os.path.join(model_dir, "tabular_model")
|
|
211
|
+
model = TabularModel.load_model(model_path)
|
|
212
|
+
return model
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def model_fn(model_dir):
|
|
216
|
+
|
|
217
|
+
# Save current working directory
|
|
218
|
+
original_cwd = os.getcwd()
|
|
219
|
+
try:
|
|
220
|
+
# Change to /tmp because PyTorch Tabular needs write access (creates a .pt_tmp directory)
|
|
221
|
+
os.chdir('/tmp')
|
|
222
|
+
|
|
223
|
+
# Load the model
|
|
224
|
+
model_path = os.path.join(model_dir, "tabular_model")
|
|
225
|
+
model = TabularModel.load_model(model_path)
|
|
226
|
+
|
|
227
|
+
# Restore the original working directory
|
|
228
|
+
finally:
|
|
229
|
+
os.chdir(original_cwd)
|
|
230
|
+
|
|
231
|
+
return model
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def input_fn(input_data, content_type):
|
|
235
|
+
"""Parse input data and return a DataFrame."""
|
|
236
|
+
if not input_data:
|
|
237
|
+
raise ValueError("Empty input data is not supported!")
|
|
238
|
+
|
|
239
|
+
# Decode bytes to string if necessary
|
|
240
|
+
if isinstance(input_data, bytes):
|
|
241
|
+
input_data = input_data.decode("utf-8")
|
|
242
|
+
|
|
243
|
+
if "text/csv" in content_type:
|
|
244
|
+
return pd.read_csv(StringIO(input_data))
|
|
245
|
+
elif "application/json" in content_type:
|
|
246
|
+
return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
|
|
247
|
+
else:
|
|
248
|
+
raise ValueError(f"{content_type} not supported!")
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def output_fn(output_df, accept_type):
|
|
252
|
+
"""Supports both CSV and JSON output formats."""
|
|
253
|
+
if "text/csv" in accept_type:
|
|
254
|
+
csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
|
|
255
|
+
return csv_output, "text/csv"
|
|
256
|
+
elif "application/json" in accept_type:
|
|
257
|
+
return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
|
|
258
|
+
else:
|
|
259
|
+
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def predict_fn(df, model) -> pd.DataFrame:
|
|
263
|
+
"""Make Predictions with our PyTorch Tabular Model
|
|
264
|
+
|
|
265
|
+
Args:
|
|
266
|
+
df (pd.DataFrame): The input DataFrame
|
|
267
|
+
model: The TabularModel use for predictions
|
|
268
|
+
|
|
269
|
+
Returns:
|
|
270
|
+
pd.DataFrame: The DataFrame with the predictions added
|
|
271
|
+
"""
|
|
272
|
+
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
273
|
+
|
|
274
|
+
# Grab our feature columns (from training)
|
|
275
|
+
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
276
|
+
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
277
|
+
features = json.load(fp)
|
|
278
|
+
print(f"Model Features: {features}")
|
|
279
|
+
|
|
280
|
+
# Load the category mappings (from training)
|
|
281
|
+
with open(os.path.join(model_dir, "category_mappings.json")) as fp:
|
|
282
|
+
category_mappings = json.load(fp)
|
|
283
|
+
|
|
284
|
+
# Load our Label Encoder if we have one
|
|
285
|
+
label_encoder = None
|
|
286
|
+
if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
|
|
287
|
+
label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
|
|
288
|
+
|
|
289
|
+
# We're going match features in a case-insensitive manner, accounting for all the permutations
|
|
290
|
+
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
291
|
+
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
292
|
+
matched_df = match_features_case_insensitive(df, features)
|
|
293
|
+
|
|
294
|
+
# Detect categorical types in the incoming DataFrame
|
|
295
|
+
matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
|
|
296
|
+
|
|
297
|
+
# If we have compressed features, decompress them
|
|
298
|
+
if compressed_features:
|
|
299
|
+
print("Decompressing features for prediction...")
|
|
300
|
+
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
301
|
+
|
|
302
|
+
# Make predictions using the TabularModel
|
|
303
|
+
result = model.predict(matched_df[features])
|
|
304
|
+
|
|
305
|
+
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
306
|
+
# and classification probabilities in columns ending with "_probability"
|
|
307
|
+
target = TEMPLATE_PARAMS["target_column"]
|
|
308
|
+
prediction_column = f"{target}_prediction"
|
|
309
|
+
if prediction_column in result.columns:
|
|
310
|
+
predictions = result[prediction_column].values
|
|
311
|
+
else:
|
|
312
|
+
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
313
|
+
|
|
314
|
+
# If we have a label encoder, decode the predictions
|
|
315
|
+
if label_encoder:
|
|
316
|
+
predictions = label_encoder.inverse_transform(predictions.astype(int))
|
|
317
|
+
|
|
318
|
+
# Set the predictions on the DataFrame
|
|
319
|
+
df["prediction"] = predictions
|
|
320
|
+
|
|
321
|
+
# For classification, get probabilities
|
|
322
|
+
if label_encoder is not None:
|
|
323
|
+
prob_cols = [col for col in result.columns if col.endswith("_probability")]
|
|
324
|
+
if prob_cols:
|
|
325
|
+
probs = result[prob_cols].values
|
|
326
|
+
df["pred_proba"] = [p.tolist() for p in probs]
|
|
327
|
+
|
|
328
|
+
# Expand the pred_proba column into separate columns for each class
|
|
329
|
+
df = expand_proba_column(df, label_encoder.classes_)
|
|
330
|
+
|
|
331
|
+
# All done, return the DataFrame with new columns for the predictions
|
|
332
|
+
return df
|
|
333
|
+
|
|
334
|
+
|
|
206
335
|
if __name__ == "__main__":
|
|
207
336
|
"""The main function is for training the PyTorch Tabular model"""
|
|
208
337
|
|
|
@@ -265,14 +394,12 @@ if __name__ == "__main__":
|
|
|
265
394
|
else:
|
|
266
395
|
# Just do a random training Split
|
|
267
396
|
print("WARNING: No training column found, splitting data with random state=42")
|
|
268
|
-
df_train, df_val = train_test_split(
|
|
269
|
-
all_df, test_size=validation_split, random_state=42
|
|
270
|
-
)
|
|
397
|
+
df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
|
|
271
398
|
print(f"FIT/TRAIN: {df_train.shape}")
|
|
272
399
|
print(f"VALIDATION: {df_val.shape}")
|
|
273
400
|
|
|
274
401
|
# Determine categorical and continuous columns
|
|
275
|
-
categorical_cols = [col for col in features if df_train[col].dtype.name ==
|
|
402
|
+
categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
|
|
276
403
|
continuous_cols = [col for col in features if col not in categorical_cols]
|
|
277
404
|
|
|
278
405
|
print(f"Categorical columns: {categorical_cols}")
|
|
@@ -287,37 +414,39 @@ if __name__ == "__main__":
|
|
|
287
414
|
|
|
288
415
|
trainer_config = TrainerConfig(
|
|
289
416
|
auto_lr_find=True,
|
|
290
|
-
batch_size=1024,
|
|
417
|
+
batch_size=min(1024, len(df_train) // 4),
|
|
291
418
|
max_epochs=100,
|
|
292
419
|
early_stopping="valid_loss",
|
|
293
|
-
early_stopping_patience=
|
|
420
|
+
early_stopping_patience=15,
|
|
421
|
+
checkpoints="valid_loss",
|
|
422
|
+
accelerator="auto",
|
|
294
423
|
progress_bar="none",
|
|
424
|
+
gradient_clip_val=1.0,
|
|
295
425
|
)
|
|
296
426
|
|
|
297
427
|
optimizer_config = OptimizerConfig()
|
|
298
428
|
|
|
299
429
|
# Choose model configuration based on model type
|
|
300
430
|
if model_type == "classifier":
|
|
301
|
-
|
|
302
|
-
model_config = TabNetModelConfig(
|
|
303
|
-
task="classification",
|
|
304
|
-
learning_rate=1e-3,
|
|
305
|
-
)
|
|
306
|
-
|
|
431
|
+
task = "classification"
|
|
307
432
|
# Encode the target column
|
|
308
433
|
label_encoder = LabelEncoder()
|
|
309
434
|
df_train[target] = label_encoder.fit_transform(df_train[target])
|
|
310
435
|
df_val[target] = label_encoder.transform(df_val[target])
|
|
311
|
-
|
|
312
436
|
else:
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
437
|
+
task = "regression"
|
|
438
|
+
label_encoder = None
|
|
439
|
+
|
|
440
|
+
# Use CategoryEmbedding for both regression and classification tasks
|
|
441
|
+
model_config = CategoryEmbeddingModelConfig(
|
|
442
|
+
task=task,
|
|
443
|
+
layers="1024-512-512",
|
|
444
|
+
activation="ReLU",
|
|
445
|
+
learning_rate=1e-3,
|
|
446
|
+
dropout=0.1,
|
|
447
|
+
use_batch_norm=True,
|
|
448
|
+
initialization="kaiming",
|
|
449
|
+
)
|
|
321
450
|
|
|
322
451
|
# Create and train the TabularModel
|
|
323
452
|
tabular_model = TabularModel(
|
|
@@ -332,16 +461,15 @@ if __name__ == "__main__":
|
|
|
332
461
|
|
|
333
462
|
# Make Predictions on the Validation Set
|
|
334
463
|
print(f"Making Predictions on Validation Set...")
|
|
335
|
-
result = tabular_model.predict(df_val)
|
|
336
|
-
print(f"Result Columns: {result.columns.tolist()}")
|
|
464
|
+
result = tabular_model.predict(df_val, include_input_features=False)
|
|
337
465
|
|
|
338
|
-
#
|
|
339
|
-
#
|
|
466
|
+
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
467
|
+
# and classification probabilities in columns ending with "_probability"
|
|
340
468
|
if model_type == "classifier":
|
|
341
|
-
preds = result["
|
|
469
|
+
preds = result[f"{target}_prediction"].values
|
|
342
470
|
else:
|
|
343
471
|
# Regression: use the target column name
|
|
344
|
-
preds = result[target].values
|
|
472
|
+
preds = result[f"{target}_prediction"].values
|
|
345
473
|
|
|
346
474
|
if model_type == "classifier":
|
|
347
475
|
# Get probabilities for classification
|
|
@@ -362,10 +490,10 @@ if __name__ == "__main__":
|
|
|
362
490
|
else:
|
|
363
491
|
y_validate = df_val[target].values
|
|
364
492
|
|
|
365
|
-
# Save predictions to S3 (just the target, prediction, and '
|
|
493
|
+
# Save predictions to S3 (just the target, prediction, and '_probability' columns)
|
|
366
494
|
df_val["prediction"] = preds
|
|
367
495
|
output_columns = [target, "prediction"]
|
|
368
|
-
output_columns += [col for col in df_val.columns if col.endswith("
|
|
496
|
+
output_columns += [col for col in df_val.columns if col.endswith("_probability")]
|
|
369
497
|
wr.s3.to_csv(
|
|
370
498
|
df_val[output_columns],
|
|
371
499
|
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
@@ -378,9 +506,7 @@ if __name__ == "__main__":
|
|
|
378
506
|
label_names = label_encoder.classes_
|
|
379
507
|
|
|
380
508
|
# Calculate various model performance metrics
|
|
381
|
-
scores = precision_recall_fscore_support(
|
|
382
|
-
y_validate, preds, average=None, labels=label_names
|
|
383
|
-
)
|
|
509
|
+
scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)
|
|
384
510
|
|
|
385
511
|
# Put the scores into a dataframe
|
|
386
512
|
score_df = pd.DataFrame(
|
|
@@ -428,116 +554,4 @@ if __name__ == "__main__":
|
|
|
428
554
|
|
|
429
555
|
# Save the category mappings
|
|
430
556
|
with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
|
|
431
|
-
json.dump(category_mappings, fp)
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
def model_fn(model_dir):
|
|
435
|
-
"""Deserialize and return fitted PyTorch Tabular model"""
|
|
436
|
-
model_path = os.path.join(model_dir, "tabular_model")
|
|
437
|
-
model = TabularModel.load_model(model_path)
|
|
438
|
-
return model
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
def input_fn(input_data, content_type):
|
|
442
|
-
"""Parse input data and return a DataFrame."""
|
|
443
|
-
if not input_data:
|
|
444
|
-
raise ValueError("Empty input data is not supported!")
|
|
445
|
-
|
|
446
|
-
# Decode bytes to string if necessary
|
|
447
|
-
if isinstance(input_data, bytes):
|
|
448
|
-
input_data = input_data.decode("utf-8")
|
|
449
|
-
|
|
450
|
-
if "text/csv" in content_type:
|
|
451
|
-
return pd.read_csv(StringIO(input_data))
|
|
452
|
-
elif "application/json" in content_type:
|
|
453
|
-
return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
|
|
454
|
-
else:
|
|
455
|
-
raise ValueError(f"{content_type} not supported!")
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
def output_fn(output_df, accept_type):
|
|
459
|
-
"""Supports both CSV and JSON output formats."""
|
|
460
|
-
if "text/csv" in accept_type:
|
|
461
|
-
csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
|
|
462
|
-
return csv_output, "text/csv"
|
|
463
|
-
elif "application/json" in accept_type:
|
|
464
|
-
return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
|
|
465
|
-
else:
|
|
466
|
-
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
def predict_fn(df, model) -> pd.DataFrame:
|
|
470
|
-
"""Make Predictions with our PyTorch Tabular Model
|
|
471
|
-
|
|
472
|
-
Args:
|
|
473
|
-
df (pd.DataFrame): The input DataFrame
|
|
474
|
-
model: The TabularModel used for predictions
|
|
475
|
-
|
|
476
|
-
Returns:
|
|
477
|
-
pd.DataFrame: The DataFrame with the predictions added
|
|
478
|
-
"""
|
|
479
|
-
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
480
|
-
|
|
481
|
-
# Grab our feature columns (from training)
|
|
482
|
-
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
483
|
-
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
484
|
-
features = json.load(fp)
|
|
485
|
-
print(f"Model Features: {features}")
|
|
486
|
-
|
|
487
|
-
# Load the category mappings (from training)
|
|
488
|
-
with open(os.path.join(model_dir, "category_mappings.json")) as fp:
|
|
489
|
-
category_mappings = json.load(fp)
|
|
490
|
-
|
|
491
|
-
# Load our Label Encoder if we have one
|
|
492
|
-
label_encoder = None
|
|
493
|
-
if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
|
|
494
|
-
label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
|
|
495
|
-
|
|
496
|
-
# We're going to match features in a case-insensitive manner, accounting for all the permutations
|
|
497
|
-
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
498
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
499
|
-
matched_df = match_features_case_insensitive(df, features)
|
|
500
|
-
|
|
501
|
-
# Detect categorical types in the incoming DataFrame
|
|
502
|
-
matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
|
|
503
|
-
|
|
504
|
-
# If we have compressed features, decompress them
|
|
505
|
-
if compressed_features:
|
|
506
|
-
print("Decompressing features for prediction...")
|
|
507
|
-
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
508
|
-
|
|
509
|
-
# Make predictions using the TabularModel
|
|
510
|
-
result = model.predict(matched_df)
|
|
511
|
-
|
|
512
|
-
# Extract predictions based on model type
|
|
513
|
-
# For regression: pytorch-tabular uses target column name
|
|
514
|
-
# For classification: pytorch-tabular uses "prediction" column
|
|
515
|
-
if "prediction" in result.columns:
|
|
516
|
-
predictions = result["prediction"].values
|
|
517
|
-
else:
|
|
518
|
-
# For regression, find the new column (not in original dataframe)
|
|
519
|
-
pred_cols = [col for col in result.columns if col not in matched_df.columns]
|
|
520
|
-
if pred_cols:
|
|
521
|
-
predictions = result[pred_cols[0]].values
|
|
522
|
-
else:
|
|
523
|
-
raise ValueError(f"Cannot find prediction column in: {result.columns.tolist()}")
|
|
524
|
-
|
|
525
|
-
# If we have a label encoder, decode the predictions
|
|
526
|
-
if label_encoder:
|
|
527
|
-
predictions = label_encoder.inverse_transform(predictions.astype(int))
|
|
528
|
-
|
|
529
|
-
# Set the predictions on the DataFrame
|
|
530
|
-
df["prediction"] = predictions
|
|
531
|
-
|
|
532
|
-
# For classification, get probabilities
|
|
533
|
-
if label_encoder is not None:
|
|
534
|
-
prob_cols = [col for col in result.columns if col.endswith("_probability")]
|
|
535
|
-
if prob_cols:
|
|
536
|
-
probs = result[prob_cols].values
|
|
537
|
-
df["pred_proba"] = [p.tolist() for p in probs]
|
|
538
|
-
|
|
539
|
-
# Expand the pred_proba column into separate columns for each class
|
|
540
|
-
df = expand_proba_column(df, label_encoder.classes_)
|
|
541
|
-
|
|
542
|
-
# All done, return the DataFrame with new columns for the predictions
|
|
543
|
-
return df
|
|
557
|
+
json.dump(category_mappings, fp)
|