workbench 0.8.198__py3-none-any.whl → 0.8.201__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +11 -4
- workbench/api/__init__.py +2 -1
- workbench/api/feature_set.py +7 -4
- workbench/api/model.py +1 -1
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/endpoint_core.py +84 -46
- workbench/core/artifacts/feature_set_core.py +69 -1
- workbench/core/artifacts/model_core.py +37 -7
- workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
- workbench/core/transforms/features_to_model/features_to_model.py +23 -20
- workbench/core/views/view.py +2 -2
- workbench/model_scripts/chemprop/chemprop.template +931 -0
- workbench/model_scripts/chemprop/generated_model_script.py +931 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
- workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
- workbench/model_scripts/pytorch_model/generated_model_script.py +130 -88
- workbench/model_scripts/pytorch_model/pytorch.template +128 -86
- workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
- workbench/model_scripts/script_generation.py +10 -7
- workbench/model_scripts/uq_models/generated_model_script.py +25 -18
- workbench/model_scripts/uq_models/mapie.template +23 -16
- workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
- workbench/model_scripts/xgb_model/xgb_model.template +2 -2
- workbench/repl/workbench_shell.py +14 -5
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
- workbench/utils/chemprop_utils.py +724 -0
- workbench/utils/pytorch_utils.py +497 -0
- workbench/utils/xgboost_model_utils.py +10 -5
- {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/METADATA +2 -2
- {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/RECORD +38 -32
- {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/entry_points.txt +2 -1
- workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
- workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/WHEEL +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.201.dist-info}/top_level.txt +0 -0
|
@@ -29,9 +29,7 @@ from io import StringIO
|
|
|
29
29
|
import json
|
|
30
30
|
import argparse
|
|
31
31
|
import joblib
|
|
32
|
-
import os
|
|
33
32
|
import pandas as pd
|
|
34
|
-
from typing import List, Tuple
|
|
35
33
|
|
|
36
34
|
# Template Parameters
|
|
37
35
|
TEMPLATE_PARAMS = {
|
|
@@ -45,7 +43,6 @@ TEMPLATE_PARAMS = {
|
|
|
45
43
|
}
|
|
46
44
|
|
|
47
45
|
|
|
48
|
-
# Function to check if dataframe is empty
|
|
49
46
|
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
|
|
50
47
|
"""
|
|
51
48
|
Check if the provided dataframe is empty and raise an exception if it is.
|
|
@@ -60,19 +57,17 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
|
|
|
60
57
|
raise ValueError(msg)
|
|
61
58
|
|
|
62
59
|
|
|
63
|
-
def expand_proba_column(df: pd.DataFrame, class_labels:
|
|
60
|
+
def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFrame:
|
|
64
61
|
"""
|
|
65
62
|
Expands a column in a DataFrame containing a list of probabilities into separate columns.
|
|
66
63
|
|
|
67
64
|
Args:
|
|
68
65
|
df (pd.DataFrame): DataFrame containing a "pred_proba" column
|
|
69
|
-
class_labels (
|
|
66
|
+
class_labels (list[str]): List of class labels
|
|
70
67
|
|
|
71
68
|
Returns:
|
|
72
69
|
pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
|
|
73
70
|
"""
|
|
74
|
-
|
|
75
|
-
# Sanity check
|
|
76
71
|
proba_column = "pred_proba"
|
|
77
72
|
if proba_column not in df.columns:
|
|
78
73
|
raise ValueError('DataFrame does not contain a "pred_proba" column')
|
|
@@ -89,11 +84,10 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
|
|
|
89
84
|
|
|
90
85
|
# Concatenate the new columns with the original DataFrame
|
|
91
86
|
df = pd.concat([df, proba_df], axis=1)
|
|
92
|
-
print(df)
|
|
93
87
|
return df
|
|
94
88
|
|
|
95
89
|
|
|
96
|
-
def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
|
|
90
|
+
def match_features_case_insensitive(df: pd.DataFrame, model_features: list[str]) -> pd.DataFrame:
|
|
97
91
|
"""
|
|
98
92
|
Matches and renames DataFrame columns to match model feature names (case-insensitive).
|
|
99
93
|
Prioritizes exact matches, then case-insensitive matches.
|
|
@@ -118,55 +112,60 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
|
|
|
118
112
|
return df.rename(columns=rename_dict)
|
|
119
113
|
|
|
120
114
|
|
|
121
|
-
def convert_categorical_types(
|
|
115
|
+
def convert_categorical_types(
|
|
116
|
+
df: pd.DataFrame, features: list[str], category_mappings: dict[str, list[str]] | None = None
|
|
117
|
+
) -> tuple[pd.DataFrame, dict[str, list[str]]]:
|
|
122
118
|
"""
|
|
123
119
|
Converts appropriate columns to categorical type with consistent mappings.
|
|
124
120
|
|
|
125
121
|
Args:
|
|
126
122
|
df (pd.DataFrame): The DataFrame to process.
|
|
127
123
|
features (list): List of feature names to consider for conversion.
|
|
128
|
-
category_mappings (dict, optional): Existing category mappings. If empty
|
|
129
|
-
training mode. If populated, we're in
|
|
124
|
+
category_mappings (dict, optional): Existing category mappings. If None or empty,
|
|
125
|
+
we're in training mode. If populated, we're in
|
|
126
|
+
inference mode.
|
|
130
127
|
|
|
131
128
|
Returns:
|
|
132
129
|
tuple: (processed DataFrame, category mappings dictionary)
|
|
133
130
|
"""
|
|
131
|
+
if category_mappings is None:
|
|
132
|
+
category_mappings = {}
|
|
133
|
+
|
|
134
134
|
# Training mode
|
|
135
|
-
if category_mappings
|
|
135
|
+
if not category_mappings:
|
|
136
136
|
for col in df.select_dtypes(include=["object", "string"]):
|
|
137
137
|
if col in features and df[col].nunique() < 20:
|
|
138
138
|
print(f"Training mode: Converting {col} to category")
|
|
139
139
|
df[col] = df[col].astype("category")
|
|
140
|
-
category_mappings[col] = df[col].cat.categories.tolist()
|
|
140
|
+
category_mappings[col] = df[col].cat.categories.tolist()
|
|
141
141
|
|
|
142
142
|
# Inference mode
|
|
143
143
|
else:
|
|
144
144
|
for col, categories in category_mappings.items():
|
|
145
145
|
if col in df.columns:
|
|
146
146
|
print(f"Inference mode: Applying categorical mapping for {col}")
|
|
147
|
-
df[col] = pd.Categorical(df[col], categories=categories)
|
|
147
|
+
df[col] = pd.Categorical(df[col], categories=categories)
|
|
148
148
|
|
|
149
149
|
return df, category_mappings
|
|
150
150
|
|
|
151
151
|
|
|
152
152
|
def decompress_features(
|
|
153
|
-
df: pd.DataFrame, features:
|
|
154
|
-
) ->
|
|
153
|
+
df: pd.DataFrame, features: list[str], compressed_features: list[str]
|
|
154
|
+
) -> tuple[pd.DataFrame, list[str]]:
|
|
155
155
|
"""Prepare features for the model
|
|
156
156
|
|
|
157
157
|
Args:
|
|
158
158
|
df (pd.DataFrame): The features DataFrame
|
|
159
|
-
features (
|
|
160
|
-
compressed_features (
|
|
159
|
+
features (list[str]): Full list of feature names
|
|
160
|
+
compressed_features (list[str]): List of feature names to decompress (bitstrings)
|
|
161
161
|
|
|
162
162
|
Returns:
|
|
163
163
|
pd.DataFrame: DataFrame with the decompressed features
|
|
164
|
-
|
|
164
|
+
list[str]: Updated list of feature names after decompression
|
|
165
165
|
|
|
166
166
|
Raises:
|
|
167
167
|
ValueError: If any missing values are found in the specified features
|
|
168
168
|
"""
|
|
169
|
-
|
|
170
169
|
# Check for any missing values in the required features
|
|
171
170
|
missing_counts = df[features].isna().sum()
|
|
172
171
|
if missing_counts.any():
|
|
@@ -176,10 +175,11 @@ def decompress_features(
|
|
|
176
175
|
"WARNING: You might want to remove/replace all NaN values before processing."
|
|
177
176
|
)
|
|
178
177
|
|
|
179
|
-
#
|
|
180
|
-
decompressed_features = features
|
|
178
|
+
# Make a copy to avoid mutating the original list
|
|
179
|
+
decompressed_features = features.copy()
|
|
180
|
+
|
|
181
181
|
for feature in compressed_features:
|
|
182
|
-
if (feature not in df.columns) or (feature not in
|
|
182
|
+
if (feature not in df.columns) or (feature not in decompressed_features):
|
|
183
183
|
print(f"Feature '{feature}' not in the features list, skipping decompression.")
|
|
184
184
|
continue
|
|
185
185
|
|
|
@@ -204,26 +204,39 @@ def decompress_features(
|
|
|
204
204
|
return df, decompressed_features
|
|
205
205
|
|
|
206
206
|
|
|
207
|
-
def model_fn(model_dir):
|
|
207
|
+
def model_fn(model_dir: str) -> TabularModel:
|
|
208
|
+
"""Load the PyTorch Tabular model from the specified directory.
|
|
208
209
|
|
|
210
|
+
Args:
|
|
211
|
+
model_dir: Directory containing the saved model
|
|
212
|
+
|
|
213
|
+
Returns:
|
|
214
|
+
Loaded TabularModel instance
|
|
215
|
+
"""
|
|
209
216
|
# Save current working directory
|
|
210
217
|
original_cwd = os.getcwd()
|
|
211
218
|
try:
|
|
212
219
|
# Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
|
|
213
220
|
os.chdir("/tmp")
|
|
214
221
|
|
|
215
|
-
#
|
|
222
|
+
# Remove callbacks.sav if it exists - it's not needed for inference and causes
|
|
223
|
+
# GPU->CPU loading issues (joblib.load doesn't support map_location)
|
|
216
224
|
model_path = os.path.join(model_dir, "tabular_model")
|
|
217
|
-
|
|
225
|
+
callbacks_path = os.path.join(model_path, "callbacks.sav")
|
|
226
|
+
if os.path.exists(callbacks_path):
|
|
227
|
+
os.remove(callbacks_path)
|
|
228
|
+
|
|
229
|
+
# Load the model (map_location="cpu" ensures GPU-trained models work on CPU endpoints)
|
|
230
|
+
model = TabularModel.load_model(model_path, map_location="cpu")
|
|
218
231
|
|
|
219
|
-
# Restore the original working directory
|
|
220
232
|
finally:
|
|
233
|
+
# Restore the original working directory
|
|
221
234
|
os.chdir(original_cwd)
|
|
222
235
|
|
|
223
236
|
return model
|
|
224
237
|
|
|
225
238
|
|
|
226
|
-
def input_fn(input_data, content_type):
|
|
239
|
+
def input_fn(input_data, content_type: str) -> pd.DataFrame:
|
|
227
240
|
"""Parse input data and return a DataFrame."""
|
|
228
241
|
if not input_data:
|
|
229
242
|
raise ValueError("Empty input data is not supported!")
|
|
@@ -240,18 +253,18 @@ def input_fn(input_data, content_type):
|
|
|
240
253
|
raise ValueError(f"{content_type} not supported!")
|
|
241
254
|
|
|
242
255
|
|
|
243
|
-
def output_fn(output_df, accept_type):
|
|
256
|
+
def output_fn(output_df: pd.DataFrame, accept_type: str) -> tuple[str, str]:
|
|
244
257
|
"""Supports both CSV and JSON output formats."""
|
|
245
258
|
if "text/csv" in accept_type:
|
|
246
|
-
csv_output = output_df.fillna("N/A").to_csv(index=False)
|
|
259
|
+
csv_output = output_df.fillna("N/A").to_csv(index=False)
|
|
247
260
|
return csv_output, "text/csv"
|
|
248
261
|
elif "application/json" in accept_type:
|
|
249
|
-
return output_df.to_json(orient="records"), "application/json"
|
|
262
|
+
return output_df.to_json(orient="records"), "application/json"
|
|
250
263
|
else:
|
|
251
264
|
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
252
265
|
|
|
253
266
|
|
|
254
|
-
def predict_fn(df, model) -> pd.DataFrame:
|
|
267
|
+
def predict_fn(df: pd.DataFrame, model: TabularModel) -> pd.DataFrame:
|
|
255
268
|
"""Make Predictions with our PyTorch Tabular Model
|
|
256
269
|
|
|
257
270
|
Args:
|
|
@@ -275,12 +288,11 @@ def predict_fn(df, model) -> pd.DataFrame:
|
|
|
275
288
|
|
|
276
289
|
# Load our Label Encoder if we have one
|
|
277
290
|
label_encoder = None
|
|
278
|
-
|
|
279
|
-
|
|
291
|
+
label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
|
|
292
|
+
if os.path.exists(label_encoder_path):
|
|
293
|
+
label_encoder = joblib.load(label_encoder_path)
|
|
280
294
|
|
|
281
|
-
#
|
|
282
|
-
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
283
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
295
|
+
# Match features in a case-insensitive manner
|
|
284
296
|
matched_df = match_features_case_insensitive(df, features)
|
|
285
297
|
|
|
286
298
|
# Detect categorical types in the incoming DataFrame
|
|
@@ -291,12 +303,25 @@ def predict_fn(df, model) -> pd.DataFrame:
|
|
|
291
303
|
print("Decompressing features for prediction...")
|
|
292
304
|
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
293
305
|
|
|
306
|
+
# Track rows with missing features
|
|
307
|
+
missing_mask = matched_df[features].isna().any(axis=1)
|
|
308
|
+
if missing_mask.any():
|
|
309
|
+
print(f"Warning: {missing_mask.sum()} rows have missing features, will return NaN predictions")
|
|
310
|
+
|
|
311
|
+
# Initialize prediction column with NaN
|
|
312
|
+
df["prediction"] = np.nan
|
|
313
|
+
|
|
314
|
+
# Only predict on complete rows
|
|
315
|
+
complete_df = matched_df[~missing_mask]
|
|
316
|
+
if len(complete_df) == 0:
|
|
317
|
+
print("Warning: No complete rows to predict on")
|
|
318
|
+
return df
|
|
319
|
+
|
|
294
320
|
# Make predictions using the TabularModel
|
|
295
|
-
result = model.predict(
|
|
321
|
+
result = model.predict(complete_df[features])
|
|
296
322
|
|
|
297
323
|
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
298
|
-
|
|
299
|
-
target = TEMPLATE_PARAMS["target_column"]
|
|
324
|
+
target = TEMPLATE_PARAMS["target"]
|
|
300
325
|
prediction_column = f"{target}_prediction"
|
|
301
326
|
if prediction_column in result.columns:
|
|
302
327
|
predictions = result[prediction_column].values
|
|
@@ -307,20 +332,23 @@ def predict_fn(df, model) -> pd.DataFrame:
|
|
|
307
332
|
if label_encoder:
|
|
308
333
|
predictions = label_encoder.inverse_transform(predictions.astype(int))
|
|
309
334
|
|
|
310
|
-
# Set
|
|
311
|
-
df["prediction"] = predictions
|
|
335
|
+
# Set predictions only for complete rows
|
|
336
|
+
df.loc[~missing_mask, "prediction"] = predictions
|
|
312
337
|
|
|
313
338
|
# For classification, get probabilities
|
|
314
339
|
if label_encoder is not None:
|
|
315
340
|
prob_cols = [col for col in result.columns if col.endswith("_probability")]
|
|
316
341
|
if prob_cols:
|
|
317
342
|
probs = result[prob_cols].values
|
|
318
|
-
|
|
343
|
+
|
|
344
|
+
# Build full proba Series with None for missing rows
|
|
345
|
+
all_proba = pd.Series([None] * len(df), index=df.index, dtype=object)
|
|
346
|
+
all_proba.loc[~missing_mask] = [p.tolist() for p in probs]
|
|
347
|
+
df["pred_proba"] = all_proba
|
|
319
348
|
|
|
320
349
|
# Expand the pred_proba column into separate columns for each class
|
|
321
350
|
df = expand_proba_column(df, label_encoder.classes_)
|
|
322
351
|
|
|
323
|
-
# All done, return the DataFrame with new columns for the predictions
|
|
324
352
|
return df
|
|
325
353
|
|
|
326
354
|
|
|
@@ -354,9 +382,21 @@ if __name__ == "__main__":
|
|
|
354
382
|
# Combine files and read them all into a single pandas dataframe
|
|
355
383
|
all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
356
384
|
|
|
385
|
+
# Print out some info about the dataframe
|
|
386
|
+
print(f"All Data Shape: {all_df.shape}")
|
|
387
|
+
print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
|
|
388
|
+
print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
|
|
389
|
+
|
|
357
390
|
# Check if the dataframe is empty
|
|
358
391
|
check_dataframe(all_df, "training_df")
|
|
359
392
|
|
|
393
|
+
# Drop any rows with missing feature values
|
|
394
|
+
initial_row_count = all_df.shape[0]
|
|
395
|
+
all_df = all_df.dropna(subset=features)
|
|
396
|
+
dropped_rows = initial_row_count - all_df.shape[0]
|
|
397
|
+
if dropped_rows > 0:
|
|
398
|
+
print(f"Dropped {dropped_rows} rows due to missing feature values.")
|
|
399
|
+
|
|
360
400
|
# Features/Target output
|
|
361
401
|
print(f"Target: {target}")
|
|
362
402
|
print(f"Features: {str(features)}")
|
|
@@ -364,11 +404,25 @@ if __name__ == "__main__":
|
|
|
364
404
|
# Convert any features that might be categorical to 'category' type
|
|
365
405
|
all_df, category_mappings = convert_categorical_types(all_df, features)
|
|
366
406
|
|
|
407
|
+
# Print out some info about the dataframe
|
|
408
|
+
print(f"All Data Shape: {all_df.shape}")
|
|
409
|
+
print(f"Feature dtypes:\n{all_df[features].dtypes.value_counts()}")
|
|
410
|
+
print(f"Int64 columns: {all_df[features].select_dtypes(include=['int64']).columns.tolist()}")
|
|
411
|
+
|
|
367
412
|
# If we have compressed features, decompress them
|
|
368
413
|
if compressed_features:
|
|
369
414
|
print(f"Decompressing features {compressed_features}...")
|
|
370
415
|
all_df, features = decompress_features(all_df, features, compressed_features)
|
|
371
416
|
|
|
417
|
+
# Determine categorical and continuous columns
|
|
418
|
+
categorical_cols = [col for col in features if all_df[col].dtype.name == "category"]
|
|
419
|
+
continuous_cols = [col for col in features if col not in categorical_cols]
|
|
420
|
+
print(f"Categorical columns: {categorical_cols}")
|
|
421
|
+
print(f"Continuous columns: {continuous_cols}")
|
|
422
|
+
|
|
423
|
+
# Cast continuous columns to float
|
|
424
|
+
all_df[continuous_cols] = all_df[continuous_cols].astype("float64")
|
|
425
|
+
|
|
372
426
|
# Do we want to train on all the data?
|
|
373
427
|
if train_all_data:
|
|
374
428
|
print("Training on ALL of the data")
|
|
@@ -378,8 +432,8 @@ if __name__ == "__main__":
|
|
|
378
432
|
# Does the dataframe have a training column?
|
|
379
433
|
elif "training" in all_df.columns:
|
|
380
434
|
print("Found training column, splitting data based on training column")
|
|
381
|
-
df_train = all_df[all_df["training"]]
|
|
382
|
-
df_val = all_df[~all_df["training"]]
|
|
435
|
+
df_train = all_df[all_df["training"]].copy()
|
|
436
|
+
df_val = all_df[~all_df["training"]].copy()
|
|
383
437
|
else:
|
|
384
438
|
# Just do a random training Split
|
|
385
439
|
print("WARNING: No training column found, splitting data with random state=42")
|
|
@@ -387,13 +441,6 @@ if __name__ == "__main__":
|
|
|
387
441
|
print(f"FIT/TRAIN: {df_train.shape}")
|
|
388
442
|
print(f"VALIDATION: {df_val.shape}")
|
|
389
443
|
|
|
390
|
-
# Determine categorical and continuous columns
|
|
391
|
-
categorical_cols = [col for col in features if df_train[col].dtype.name == "category"]
|
|
392
|
-
continuous_cols = [col for col in features if col not in categorical_cols]
|
|
393
|
-
|
|
394
|
-
print(f"Categorical columns: {categorical_cols}")
|
|
395
|
-
print(f"Continuous columns: {continuous_cols}")
|
|
396
|
-
|
|
397
444
|
# Set up PyTorch Tabular configuration
|
|
398
445
|
data_config = DataConfig(
|
|
399
446
|
target=[target],
|
|
@@ -417,11 +464,12 @@ if __name__ == "__main__":
|
|
|
417
464
|
|
|
418
465
|
# Set up PyTorch Tabular configuration with defaults
|
|
419
466
|
trainer_defaults = {
|
|
420
|
-
"auto_lr_find":
|
|
421
|
-
"batch_size": min(
|
|
467
|
+
"auto_lr_find": False,
|
|
468
|
+
"batch_size": min(128, max(32, len(df_train) // 16)),
|
|
422
469
|
"max_epochs": 100,
|
|
470
|
+
"min_epochs": 10,
|
|
423
471
|
"early_stopping": "valid_loss",
|
|
424
|
-
"early_stopping_patience":
|
|
472
|
+
"early_stopping_patience": 10,
|
|
425
473
|
"checkpoints": "valid_loss",
|
|
426
474
|
"accelerator": "auto",
|
|
427
475
|
"progress_bar": "none",
|
|
@@ -430,7 +478,6 @@ if __name__ == "__main__":
|
|
|
430
478
|
|
|
431
479
|
# Override defaults with training_config if present
|
|
432
480
|
training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
|
|
433
|
-
# Print overwrites
|
|
434
481
|
for key, value in training_overrides.items():
|
|
435
482
|
print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
|
|
436
483
|
trainer_params = {**trainer_defaults, **training_overrides}
|
|
@@ -438,23 +485,20 @@ if __name__ == "__main__":
|
|
|
438
485
|
|
|
439
486
|
# Model config defaults
|
|
440
487
|
model_defaults = {
|
|
441
|
-
"layers": "
|
|
442
|
-
"activation": "
|
|
488
|
+
"layers": "256-128-64",
|
|
489
|
+
"activation": "LeakyReLU",
|
|
443
490
|
"learning_rate": 1e-3,
|
|
444
|
-
"dropout": 0.
|
|
491
|
+
"dropout": 0.3,
|
|
445
492
|
"use_batch_norm": True,
|
|
446
493
|
"initialization": "kaiming",
|
|
447
494
|
}
|
|
448
495
|
# Override defaults with model_config if present
|
|
449
496
|
model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
|
|
450
|
-
# Print overwrites
|
|
451
497
|
for key, value in model_overrides.items():
|
|
452
498
|
print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
|
|
453
499
|
model_params = {**model_defaults, **model_overrides}
|
|
454
500
|
|
|
455
501
|
# Use CategoryEmbedding model configuration for general-purpose tabular modeling.
|
|
456
|
-
# Works effectively for both regression and classification as the foundational
|
|
457
|
-
# architecture in PyTorch Tabular
|
|
458
502
|
model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
|
|
459
503
|
optimizer_config = OptimizerConfig()
|
|
460
504
|
|
|
@@ -474,36 +518,34 @@ if __name__ == "__main__":
|
|
|
474
518
|
result = tabular_model.predict(df_val, include_input_features=False)
|
|
475
519
|
|
|
476
520
|
# pytorch-tabular returns predictions using f"{target}_prediction" column
|
|
477
|
-
|
|
478
|
-
if model_type == "classifier":
|
|
479
|
-
preds = result[f"{target}_prediction"].values
|
|
480
|
-
else:
|
|
481
|
-
# Regression: use the target column name
|
|
482
|
-
preds = result[f"{target}_prediction"].values
|
|
521
|
+
preds = result[f"{target}_prediction"].values
|
|
483
522
|
|
|
484
523
|
if model_type == "classifier":
|
|
485
524
|
# Get probabilities for classification
|
|
486
525
|
print("Processing Probabilities...")
|
|
487
|
-
prob_cols = [col for col in result.columns if col.endswith("_probability")]
|
|
526
|
+
prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
|
|
488
527
|
if prob_cols:
|
|
489
528
|
probs = result[prob_cols].values
|
|
529
|
+
df_val = df_val.copy() # Avoid SettingWithCopyWarning
|
|
490
530
|
df_val["pred_proba"] = [p.tolist() for p in probs]
|
|
491
531
|
|
|
492
532
|
# Expand the pred_proba column into separate columns for each class
|
|
493
|
-
print(df_val.columns)
|
|
533
|
+
print(df_val.columns.tolist())
|
|
494
534
|
df_val = expand_proba_column(df_val, label_encoder.classes_)
|
|
495
|
-
print(df_val.columns)
|
|
535
|
+
print(df_val.columns.tolist())
|
|
496
536
|
|
|
497
537
|
# Decode the target and prediction labels
|
|
498
538
|
y_validate = label_encoder.inverse_transform(df_val[target])
|
|
499
|
-
|
|
539
|
+
preds_decoded = label_encoder.inverse_transform(preds.astype(int))
|
|
500
540
|
else:
|
|
501
541
|
y_validate = df_val[target].values
|
|
542
|
+
preds_decoded = preds
|
|
502
543
|
|
|
503
|
-
# Save predictions to S3
|
|
504
|
-
df_val
|
|
544
|
+
# Save predictions to S3
|
|
545
|
+
df_val = df_val.copy()
|
|
546
|
+
df_val["prediction"] = preds_decoded
|
|
505
547
|
output_columns = [target, "prediction"]
|
|
506
|
-
output_columns += [col for col in df_val.columns if col.endswith("
|
|
548
|
+
output_columns += [col for col in df_val.columns if col.endswith("_proba")]
|
|
507
549
|
wr.s3.to_csv(
|
|
508
550
|
df_val[output_columns],
|
|
509
551
|
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
@@ -516,7 +558,7 @@ if __name__ == "__main__":
|
|
|
516
558
|
label_names = label_encoder.classes_
|
|
517
559
|
|
|
518
560
|
# Calculate various model performance metrics
|
|
519
|
-
scores = precision_recall_fscore_support(y_validate,
|
|
561
|
+
scores = precision_recall_fscore_support(y_validate, preds_decoded, average=None, labels=label_names)
|
|
520
562
|
|
|
521
563
|
# Put the scores into a dataframe
|
|
522
564
|
score_df = pd.DataFrame(
|
|
@@ -524,20 +566,20 @@ if __name__ == "__main__":
|
|
|
524
566
|
target: label_names,
|
|
525
567
|
"precision": scores[0],
|
|
526
568
|
"recall": scores[1],
|
|
527
|
-
"
|
|
569
|
+
"f1": scores[2],
|
|
528
570
|
"support": scores[3],
|
|
529
571
|
}
|
|
530
572
|
)
|
|
531
573
|
|
|
532
|
-
#
|
|
533
|
-
metrics = ["precision", "recall", "
|
|
574
|
+
# Output metrics per class
|
|
575
|
+
metrics = ["precision", "recall", "f1", "support"]
|
|
534
576
|
for t in label_names:
|
|
535
577
|
for m in metrics:
|
|
536
578
|
value = score_df.loc[score_df[target] == t, m].iloc[0]
|
|
537
579
|
print(f"Metrics:{t}:{m} {value}")
|
|
538
580
|
|
|
539
581
|
# Compute and output the confusion matrix
|
|
540
|
-
conf_mtx = confusion_matrix(y_validate,
|
|
582
|
+
conf_mtx = confusion_matrix(y_validate, preds_decoded, labels=label_names)
|
|
541
583
|
for i, row_name in enumerate(label_names):
|
|
542
584
|
for j, col_name in enumerate(label_names):
|
|
543
585
|
value = conf_mtx[i, j]
|
|
@@ -545,9 +587,9 @@ if __name__ == "__main__":
|
|
|
545
587
|
|
|
546
588
|
else:
|
|
547
589
|
# Calculate various model performance metrics (regression)
|
|
548
|
-
rmse = root_mean_squared_error(y_validate,
|
|
549
|
-
mae = mean_absolute_error(y_validate,
|
|
550
|
-
r2 = r2_score(y_validate,
|
|
590
|
+
rmse = root_mean_squared_error(y_validate, preds_decoded)
|
|
591
|
+
mae = mean_absolute_error(y_validate, preds_decoded)
|
|
592
|
+
r2 = r2_score(y_validate, preds_decoded)
|
|
551
593
|
print(f"RMSE: {rmse:.3f}")
|
|
552
594
|
print(f"MAE: {mae:.3f}")
|
|
553
595
|
print(f"R2: {r2:.3f}")
|
|
@@ -560,7 +602,7 @@ if __name__ == "__main__":
|
|
|
560
602
|
|
|
561
603
|
# Save the features (this will validate input during predictions)
|
|
562
604
|
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
|
|
563
|
-
json.dump(orig_features, fp)
|
|
605
|
+
json.dump(orig_features, fp)
|
|
564
606
|
|
|
565
607
|
# Save the category mappings
|
|
566
608
|
with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
|