workbench 0.8.178__py3-none-any.whl → 0.8.180__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +32 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
- workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/utils/model_utils.py +2 -1
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/xgboost_model_utils.py +160 -137
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/METADATA +1 -1
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/RECORD +26 -26
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/WHEEL +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -25,7 +21,8 @@ TEMPLATE_PARAMS = {
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
 
 
 def decompress_features(
-
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features
 
@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -227,7 +219,8 @@ if __name__ == "__main__":
 
     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)
 
     # Evaluate XGBoost performance
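The hunk above wires the templated hyperparameters into the XGBoost point estimator. As a minimal sketch, assuming the `{{hyperparameters}}` placeholder renders to a plain Python dict (the example values below are illustrative, not from the package):

    from xgboost import XGBRegressor

    # Assumed example of a rendered hyperparameters placeholder
    hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1}

    print(f"Hyperparameters: {hyperparameters}")
    # Same construction pattern as the added lines in the hunk above
    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
    # xgb_model.fit(X_train, y_train) would follow, as in the script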
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
     # Create MAPIE CQR model for this confidence level
     print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
     mapie_model = ConformalizedQuantileRegressor(
-        quantile_estimators,
-        confidence_level=confidence_level,
-        prefit=True
+        quantile_estimators, confidence_level=confidence_level, prefit=True
    )
 
     # Conformalize the model
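For orientation, here is a hedged sketch of the conformalized quantile regression pattern these hunks touch: three prefit LightGBM quantile estimators (lower, median, upper) handed to MAPIE's CQR. The quantile levels and the MAPIE import path are assumptions that may differ from the package's exact wiring.

    from lightgbm import LGBMRegressor
    from mapie.regression import ConformalizedQuantileRegressor  # import path may vary by MAPIE version

    confidence_level = 0.95
    alphas = [(1 - confidence_level) / 2, 0.5, 1 - (1 - confidence_level) / 2]
    quantile_estimators = []
    for alpha in alphas:
        # One quantile regressor per target quantile, mirroring the kwargs in the hunk above
        est = LGBMRegressor(objective="quantile", alpha=alpha, random_state=42, verbose=-1, force_col_wise=True)
        est.fit(X_train, y_train)  # X_train / y_train as prepared earlier in the script
        quantile_estimators.append(est)

    mapie_model = ConformalizedQuantileRegressor(
        quantile_estimators, confidence_level=confidence_level, prefit=True
    )
    # The wrapped model is then conformalized on held-out data before producing intervals.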
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }
 
 
@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=[
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """
 
+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:
 
     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
 
     # Get features for prediction
     X = matched_df[model_features]
@@ -475,7 +466,7 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]
 
-    # Calculate a
+    # Calculate a pseudo-standard deviation from the 68% interval width
    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0
 
     # Reorder the quantile columns for easier reading
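Why `(q_84 - q_16) / 2.0` works as a pseudo-standard deviation: for a normal distribution the 16th and 84th percentiles sit roughly one standard deviation below and above the mean (the CDF at +1 sigma is about 0.8413), so half the 68% interval width approximates sigma. A quick illustrative check, using scipy only for the numbers:

    from scipy.stats import norm

    q16, q84 = norm.ppf(0.16), norm.ppf(0.84)   # about -0.9945 and +0.9945 for a standard normal
    print((q84 - q16) / 2)                      # about 0.99, i.e. roughly one sigma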
@@ -484,7 +475,19 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df[other_cols + quantile_cols]
 
     # Adjust the outer quantiles to ensure they encompass the prediction
-
-
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])
 
     return df
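The added block above clamps each lower quantile to be no greater than the point prediction and each upper quantile to be no smaller, so the intervals always bracket the prediction when the flag is on. The same effect in a compact loop, shown only as a sketch:

    import numpy as np

    lower_cols = ["q_025", "q_05", "q_10", "q_16", "q_25"]
    upper_cols = ["q_75", "q_84", "q_90", "q_95", "q_975"]
    if outlier_stretch:
        for col in lower_cols:
            df[col] = np.minimum(df[col], df["prediction"])   # lower bounds never exceed the prediction
        for col in upper_cols:
            df[col] = np.maximum(df[col], df["prediction"])   # upper bounds never fall below it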
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -24,7 +20,6 @@ from typing import List, Tuple
 from proximity import Proximity


-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
@@ -32,7 +27,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }


@@ -183,11 +178,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -222,9 +213,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

@@ -289,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)

-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


 def input_fn(input_data, content_type):
@@ -353,8 +338,8 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params[
-    df["prediction_std"] = dist_params[
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
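The hunk above pulls the mean and standard deviation out of the NGBoost predictive distribution while the XGBoost model keeps supplying the point prediction. A hedged sketch of that hybrid pattern, with the dict keys and column names taken from the hunks and the surrounding wiring assumed:

    xgb_model, ngb_model = models["xgboost"], models["ngboost"]

    df["prediction"] = xgb_model.predict(X)          # point estimate (assumed step)
    y_dists = ngb_model.pred_dist(X)                 # NGBoost predictive distribution
    dist_params = y_dists.params                     # e.g. {"loc": ..., "scale": ...} for a Normal
    df["prediction_uq"] = dist_params["loc"]         # NGBoost mean
    df["prediction_std"] = dist_params["scale"]      # NGBoost standard deviation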
@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -212,8 +205,8 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params[
-    df["prediction_std"] = dist_params[
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
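The last hunk fills interval columns directly from the distribution's percent point function. The same pattern as a small sketch, where the particular quantile set is an assumption rather than the package's exact list:

    # Assumed quantile columns; y_dists.ppf(q) returns one value per row
    quantiles = {"q_025": 0.025, "q_25": 0.25, "q_50": 0.50, "q_75": 0.75, "q_975": 0.975}
    for col, q in quantiles.items():
        df[col] = y_dists.ppf(q)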
@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }

 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)

+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()

-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
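The residual step visible in the last hunk computes signed and absolute errors on the validation frame and writes it to S3 with awswrangler. A hedged sketch of that step; the S3 path variable and output filename below are assumptions:

    import awswrangler as wr

    result_df["residual"] = result_df[target] - result_df["prediction"]
    result_df["residual_abs"] = result_df["residual"].abs()
    # Write the validation results next to the other model metrics (path/filename assumed)
    wr.s3.to_csv(result_df, path=f"{model_metrics_s3_path}/validation_predictions.csv", index=False)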
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
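For context, a minimal sketch of a SageMaker-style input_fn matching the shape of the function touched above: decode bytes, then parse the payload into a DataFrame. The CSV-only branch here is an assumption; the real script may also accept JSON.

    from io import StringIO
    import pandas as pd

    def input_fn(input_data, content_type):
        """Parse input data and return a DataFrame (sketch)."""
        if not input_data:
            raise ValueError("Empty input data is not supported!")
        if isinstance(input_data, bytes):
            input_data = input_data.decode("utf-8")
        if "text/csv" in content_type:
            return pd.read_csv(StringIO(input_data))
        raise ValueError(f"{content_type} not supported!")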
@@ -36,12 +36,12 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
     "train_all_data": "{{train_all_data}}",
-    "hyperparameters": "{{hyperparameters}}"
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     if missing:
         raise ValueError(f"Features not found: {missing}")

+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)


@@ -210,7 +210,7 @@ def model_fn(model_dir):
     original_cwd = os.getcwd()
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir(
+        os.chdir("/tmp")

         # Load the model
         model_path = os.path.join(model_dir, "tabular_model")
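The model_fn hunk above switches the working directory to /tmp because PyTorch Tabular needs a writable cwd. Isolated as a sketch, with the restore step assumed to live in a finally clause:

    import os

    original_cwd = os.getcwd()
    try:
        os.chdir("/tmp")
        # ... load the model from os.path.join(model_dir, "tabular_model") ...
    finally:
        os.chdir(original_cwd)   # always restore the original working directory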
@@ -328,7 +328,7 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""

     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@ if __name__ == "__main__":
     }

     # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get(
-        if k in trainer_defaults}
+    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
     # Print overwrites
     for key, value in training_overrides.items():
         print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@ if __name__ == "__main__":
         "initialization": "kaiming",
     }
     # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get(
-        if k in model_defaults}
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
     # Print overwrites
     for key, value in model_overrides.items():
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
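The two override hunks above use the same dict-comprehension filter: only keys that the defaults dict already knows about are accepted from the user-supplied hyperparameters. A standalone sketch of that pattern; the default values, the bogus key, and the final update step are illustrative assumptions:

    trainer_defaults = {"max_epochs": 50, "batch_size": 1024}                  # assumed defaults
    hyperparameters = {"training_config": {"max_epochs": 100, "bogus": 1}}     # assumed user input

    training_overrides = {
        k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults
    }
    trainer_defaults.update(training_overrides)  # {"max_epochs": 100, "batch_size": 1024}; "bogus" is ignored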
@@ -461,10 +455,7 @@ if __name__ == "__main__":
     # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     # Works effectively for both regression and classification as the foundational
     # architecture in PyTorch Tabular
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        **model_params
-    )
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()

     #####################################
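For readers unfamiliar with PyTorch Tabular, here is a hedged sketch of how config objects like the ones above typically come together; this follows the library's standard usage, not necessarily the exact workbench wiring, and the column names, layer sizes, and epoch counts are assumptions:

    from pytorch_tabular import TabularModel
    from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
    from pytorch_tabular.models import CategoryEmbeddingModelConfig

    model_config = CategoryEmbeddingModelConfig(task="regression", layers="128-64", learning_rate=1e-3)
    tabular_model = TabularModel(
        data_config=DataConfig(target=["target"], continuous_cols=["feat_1", "feat_2"]),
        model_config=model_config,
        optimizer_config=OptimizerConfig(),
        trainer_config=TrainerConfig(max_epochs=50, batch_size=1024),
    )
    tabular_model.fit(train=df_train, validation=df_val)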
@@ -4,11 +4,7 @@ import awswrangler as wr
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -22,9 +18,10 @@ TEMPLATE_PARAMS = {
     "target_column": "{{target_column}}",
     "features": "{{feature_list}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -64,6 +61,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)

+
 if __name__ == "__main__":
     """The main function is for training the XGBoost Quantile Regression models"""

@@ -86,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
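The training-file listing collapsed into one line in several hunks above is the same pattern throughout these scripts. As a standalone sketch, assuming the standard SageMaker training-channel layout:

    import os
    import pandas as pd

    train_dir = os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")
    training_files = [os.path.join(train_dir, f) for f in os.listdir(train_dir) if f.endswith(".csv")]
    all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])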
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":

     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)

+
     #
     # Inference Section
     #
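The standardization hunk above wraps the chosen scikit-learn estimator in a Pipeline when it benefits from scaled inputs. A minimal sketch of that design; the example model class is an assumption:

    from sklearn.linear_model import Ridge          # example model class (assumed)
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    model = Ridge()
    needs_standardization = True
    if needs_standardization:
        # Scaling and the estimator travel together, so inference applies the same transform
        model = Pipeline([("scaler", StandardScaler()), ("model", model)])
    # model.fit(X_train, y_train) then proceeds exactly as with the bare estimator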