workbench 0.8.177__py3-none-any.whl → 0.8.179__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +67 -8
- workbench/core/views/training_view.py +38 -48
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +44 -45
- workbench/model_scripts/custom_models/uq_models/mapie.template +42 -43
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/scripts/ml_pipeline_sqs.py +14 -2
- workbench/utils/model_utils.py +12 -2
- workbench/utils/xgboost_model_utils.py +161 -138
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/RECORD +27 -27
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
workbench/model_scripts/pytorch_model/pytorch.template
CHANGED

@@ -36,12 +36,12 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
     "train_all_data": "{{train_all_data}}",
-    "hyperparameters": "{{hyperparameters}}"
+    "hyperparameters": "{{hyperparameters}}",
 }


@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     if missing:
         raise ValueError(f"Features not found: {missing}")

+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)


@@ -210,7 +210,7 @@ def model_fn(model_dir):
     original_cwd = os.getcwd()
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir(
+        os.chdir("/tmp")

         # Load the model
         model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""

     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@ if __name__ == "__main__":
     }

     # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get(
-        if k in trainer_defaults}
+    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
     # Print overwrites
     for key, value in training_overrides.items():
         print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@ if __name__ == "__main__":
         "initialization": "kaiming",
     }
     # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get(
-        if k in model_defaults}
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
     # Print overwrites
     for key, value in model_overrides.items():
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
@@ -461,10 +455,7 @@ if __name__ == "__main__":
     # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     # Works effectively for both regression and classification as the foundational
     # architecture in PyTorch Tabular
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        **model_params
-    )
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()

     #####################################
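For readers skimming the pytorch.template changes above, a minimal sketch of the override pattern those collapsed dict comprehensions implement: optional "training_config" and "model_config" sub-dicts inside the hyperparameters are filtered so that only keys already present in the defaults are kept. The default values below are hypothetical placeholders (only "initialization": "kaiming" appears in the diff); the filtering and the print loop mirror the diff.

    # Sketch of the hyperparameters override pattern; default values are placeholders.
    trainer_defaults = {"max_epochs": 100, "batch_size": 128}           # hypothetical defaults
    model_defaults = {"layers": "128-64", "initialization": "kaiming"}  # "kaiming" is from the diff

    hyperparameters = {
        "training_config": {"max_epochs": 50, "not_a_trainer_key": 1},  # unknown keys are dropped
        "model_config": {"layers": "256-128"},
    }

    # Keep only overrides whose keys already exist in the defaults
    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}

    for key, value in training_overrides.items():
        print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
    for key, value in model_overrides.items():
        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")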
workbench/model_scripts/quant_regression/quant_regression.template
CHANGED

@@ -4,11 +4,7 @@ import awswrangler as wr
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -22,9 +18,10 @@ TEMPLATE_PARAMS = {
     "target_column": "{{target_column}}",
     "features": "{{feature_list}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -64,6 +61,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)

+
 if __name__ == "__main__":
     """The main function is for training the XGBoost Quantile Regression models"""

@@ -86,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
workbench/model_scripts/scikit_learn/scikit_learn.template
CHANGED

@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":

     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)

+
     #
     # Inference Section
     #
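As a rough illustration of the one-line Pipeline wrapping in the scikit_learn.template hunk above, here is a sketch; the Ridge estimator and the random data are placeholders, not part of the template.

    # Sketch: conditionally wrap an estimator in a scaling pipeline, as the template
    # does when needs_standardization is true. Estimator and data are placeholders.
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    model = Ridge()
    needs_standardization = True
    if needs_standardization:
        model = Pipeline([("scaler", StandardScaler()), ("model", model)])

    X, y = np.random.rand(20, 3), np.random.rand(20)
    model.fit(X, y)  # the scaler is fit first, then the wrapped model sees scaled features
    print(model.predict(X[:2]))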
workbench/model_scripts/xgb_model/generated_model_script.py
CHANGED

@@ -32,10 +32,12 @@ TEMPLATE_PARAMS = {
     "target": "udm_asy_res_value",
     "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/
-    "train_all_data":
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/logd-hyper-80/training",
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)

     # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings


-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model

     Args:
         df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])

     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression

     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)

     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_

         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")

     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
    return model

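A hedged sketch of what the new hyperparameters plumbing above amounts to at fit time: the dict from TEMPLATE_PARAMS is unpacked as keyword arguments into the XGBoost estimator. The training data below is synthetic, only a subset of the parameter values from the diff is shown, and a reasonably recent xgboost is assumed.

    # Sketch: unpack a hyperparameters dict into XGBRegressor, as the generated
    # script now does. Data is synthetic; parameter values are copied from the diff.
    import numpy as np
    import xgboost as xgb

    hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.05,
                       "subsample": 0.7, "reg_alpha": 0.5, "reg_lambda": 2.0}

    xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)

    X_train = np.random.rand(64, 4)
    y_train = np.random.rand(64)
    xgb_model.fit(X_train, y_train)
    print(xgb_model.get_params()["n_estimators"])  # 200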
workbench/model_scripts/xgb_model/xgb_model.template
CHANGED

@@ -33,9 +33,11 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }

+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -75,7 +77,7 @@ def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFra
     proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)

     # Drop any proba columns and reset the index in prep for the concat
-    df = df.drop(columns=[proba_column]+proba_splits, errors="ignore")
+    df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
     df = df.reset_index(drop=True)

     # Concatenate the new columns with the original DataFrame
@@ -140,8 +142,10 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
     return df, category_mappings


-def decompress_features(
-
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model

     Args:
         df (pd.DataFrame): The features DataFrame
@@ -204,6 +208,7 @@ if __name__ == "__main__":
     model_type = TEMPLATE_PARAMS["model_type"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2

     # Script arguments for input/output directories
@@ -216,11 +221,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -255,15 +256,16 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")

+    # Use any hyperparameters to set up both the trainer and model configurations
+    print(f"Hyperparameters: {hyperparameters}")
+
     # Now spin up our XGB Model
     if model_type == "classifier":
-        xgb_model = xgb.XGBClassifier(enable_categorical=True)
+        xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)

         # Encode the target column
         label_encoder = LabelEncoder()
@@ -271,12 +273,12 @@ if __name__ == "__main__":
         df_val[target] = label_encoder.transform(df_val[target])

     else:
-        xgb_model = xgb.XGBRegressor(enable_categorical=True)
+        xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
         label_encoder = None  # We don't need this for regression

     # Grab our Features, Target and Train the Model
     y_train = df_train[target]
-    X_train= df_train[features]
+    X_train = df_train[features]
     xgb_model.fit(X_train, y_train)

     # Make Predictions on the Validation Set
@@ -315,9 +317,7 @@ if __name__ == "__main__":
         label_names = label_encoder.classes_

         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            y_validate, preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(y_validate, preds, average=None, labels=label_names)

         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -355,7 +355,9 @@ if __name__ == "__main__":
     print(f"NumRows: {len(df_val)}")

     # Now save the model to the standard place/name
-
+    joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
+
+    # Save the label encoder if we have one
     if label_encoder:
         joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))

@@ -370,19 +372,8 @@ if __name__ == "__main__":

 def model_fn(model_dir):
     """Deserialize and return fitted XGBoost model"""
-
-
-
-    with open(model_path, "r") as f:
-        model_json = json.load(f)
-
-    sklearn_data = model_json['learner']['attributes']['scikit_learn']
-    model_type = json.loads(sklearn_data)['_estimator_type']
-
-    model_class = xgb.XGBClassifier if model_type == "classifier" else xgb.XGBRegressor
-    model = model_class(enable_categorical=True)
-    model.load_model(model_path)
-
+    model_path = os.path.join(model_dir, "xgb_model.joblib")
+    model = joblib.load(model_path)
     return model

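Both XGBoost scripts above switch model persistence from XGBoost's native JSON loader to a joblib round trip. A minimal sketch of that save/load cycle, where the temporary directory stands in for SageMaker's model_dir and everything else follows the diff:

    # Sketch: dump the fitted estimator at train time, reload it in model_fn.
    import os
    import tempfile

    import joblib
    import numpy as np
    import xgboost as xgb

    model_dir = tempfile.mkdtemp()  # placeholder for SageMaker's model_dir
    xgb_model = xgb.XGBRegressor(n_estimators=10)
    xgb_model.fit(np.random.rand(30, 3), np.random.rand(30))

    # Training side: save under the standard name used in the diff
    joblib.dump(xgb_model, os.path.join(model_dir, "xgb_model.joblib"))

    # Inference side: mirrors the rewritten model_fn
    def model_fn(model_dir):
        """Deserialize and return fitted XGBoost model"""
        model_path = os.path.join(model_dir, "xgb_model.joblib")
        return joblib.load(model_path)

    print(model_fn(model_dir).predict(np.random.rand(2, 3)))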
workbench/scripts/ml_pipeline_sqs.py
CHANGED

@@ -13,12 +13,13 @@ cm = ConfigManager()
 workbench_bucket = cm.get_config("WORKBENCH_BUCKET")


-def submit_to_sqs(script_path: str, size: str = "small") -> None:
+def submit_to_sqs(script_path: str, size: str = "small", realtime: bool = False) -> None:
     """
     Upload script to S3 and submit message to SQS queue for processing.
     Args:
         script_path: Local path to the ML pipeline script
         size: Job size tier - "small" (default), "medium", or "large"
+        realtime: If True, sets serverless=False for real-time processing (default: False, meaning serverless=True)
     """
     print(f"\n{'=' * 60}")
     print("🚀 SUBMITTING ML PIPELINE JOB")
@@ -33,6 +34,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

     print(f"📄 Script: {script_file.name}")
     print(f"📏 Size tier: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
     print(f"🪣 Bucket: {workbench_bucket}")
     sqs = AWSAccountClamp().boto3_session.client("sqs")
     script_name = script_file.name
@@ -88,6 +90,10 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

     # Prepare message
     message = {"script_path": s3_path, "size": size}
+
+    # Set serverless environment variable (defaults to True, False if --realtime)
+    message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
+
     print("\n📨 Sending message to SQS...")

     # Send the message to SQS
@@ -110,6 +116,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
     print(f"{'=' * 60}")
     print(f"📄 Script: {script_name}")
     print(f"📏 Size: {size}")
+    print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
     print(f"🆔 Message ID: {message_id}")
     print("\n🔍 MONITORING LOCATIONS:")
     print(f"  • SQS Queue: AWS Console → SQS → {queue_name}")
@@ -126,9 +133,14 @@ def main():
     parser.add_argument(
         "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
     )
+    parser.add_argument(
+        "--realtime",
+        action="store_true",
+        help="Run in real-time mode (sets serverless=False). Default is serverless mode (serverless=True)",
+    )
     args = parser.parse_args()
     try:
-        submit_to_sqs(args.script_file, args.size)
+        submit_to_sqs(args.script_file, args.size, realtime=args.realtime)
     except Exception as e:
         print(f"\n❌ ERROR: {e}")
         log.error(f"Error: {e}")
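To make the new --realtime flag concrete, here is a sketch of the message body the script would now place on the queue; build_message is a hypothetical helper and the S3 path is a placeholder, but the keys follow the diff above.

    # Sketch of the SQS message payload with the new "environment" key.
    import json

    def build_message(s3_path: str, size: str = "small", realtime: bool = False) -> dict:
        message = {"script_path": s3_path, "size": size}
        # Serverless by default; --realtime flips SERVERLESS to "False"
        message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
        return message

    print(json.dumps(build_message("s3://example-bucket/scripts/pipeline.py", "medium", realtime=True), indent=2))
    # -> {"script_path": "...", "size": "medium", "environment": {"SERVERLESS": "False"}}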
workbench/utils/model_utils.py
CHANGED
@@ -222,6 +222,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
         lower_95, upper_95 = df["q_025"], df["q_975"]
         lower_90, upper_90 = df["q_05"], df["q_95"]
         lower_80, upper_80 = df["q_10"], df["q_90"]
+        lower_68 = df.get("q_16", 0)
+        upper_68 = df.get("q_84", 0)
         lower_50, upper_50 = df["q_25"], df["q_75"]
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -230,6 +232,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
         upper_90 = df["prediction"] + 1.645 * df["prediction_std"]
         lower_80 = df["prediction"] - 1.282 * df["prediction_std"]
         upper_80 = df["prediction"] + 1.282 * df["prediction_std"]
+        lower_68 = df["prediction"] - 1.0 * df["prediction_std"]
+        upper_68 = df["prediction"] + 1.0 * df["prediction_std"]
         lower_50 = df["prediction"] - 0.674 * df["prediction_std"]
         upper_50 = df["prediction"] + 0.674 * df["prediction_std"]
     else:
@@ -241,11 +245,13 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
     coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
     coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
+    coverage_68 = np.mean((df[target_col] >= lower_68) & (df[target_col] <= upper_68))
     coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
     avg_width_95 = np.mean(upper_95 - lower_95)
     avg_width_90 = np.mean(upper_90 - lower_90)
     avg_width_80 = np.mean(upper_80 - lower_80)
     avg_width_50 = np.mean(upper_50 - lower_50)
+    avg_width_68 = np.mean(upper_68 - lower_68)

     # --- CRPS (measures calibration + sharpness) ---
     z = (df[target_col] - df["prediction"]) / df["prediction_std"]
@@ -269,12 +275,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     # Collect results
     results = {
         "coverage_50": coverage_50,
+        "coverage_68": coverage_68,
         "coverage_80": coverage_80,
         "coverage_90": coverage_90,
         "coverage_95": coverage_95,
-        "avg_std": avg_std,
         "median_std": median_std,
+        "avg_std": avg_std,
         "avg_width_50": avg_width_50,
+        "avg_width_68": avg_width_68,
         "avg_width_80": avg_width_80,
         "avg_width_90": avg_width_90,
         "avg_width_95": avg_width_95,
@@ -286,12 +294,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

     print("\n=== UQ Metrics ===")
     print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
+    print(f"Coverage @ 68%: {coverage_68:.3f} (target: 0.68)")
     print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
     print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
     print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
-    print(f"Avg Prediction StdDev: {avg_std:.3f}")
     print(f"Median Prediction StdDev: {median_std:.3f}")
+    print(f"Avg Prediction StdDev: {avg_std:.3f}")
     print(f"Average 50% Width: {avg_width_50:.3f}")
+    print(f"Average 68% Width: {avg_width_68:.3f}")
     print(f"Average 80% Width: {avg_width_80:.3f}")
     print(f"Average 90% Width: {avg_width_90:.3f}")
     print(f"Average 95% Width: {avg_width_95:.3f}")