workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic. Click here for more details.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +14 -12
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/compound.py +1 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +18 -5
- workbench/api/feature_set.py +121 -15
- workbench/api/meta.py +5 -2
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +55 -21
- workbench/api/monitor.py +1 -16
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +16 -8
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +382 -253
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +135 -80
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +62 -40
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +649 -0
- workbench/model_scripts/chemprop/generated_model_script.py +649 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +20 -11
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +369 -401
- workbench/repl/workbench_shell.py +28 -19
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/scripts/training_test.py +85 -0
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +175 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +219 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +278 -79
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -219
- workbench/web_interface/components/model_plot.py +14 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +38 -74
- workbench/web_interface/components/plugins/scatter_plot.py +6 -10
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
- workbench-0.8.220.dist-info/entry_points.txt +11 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- workbench-0.8.162.dist-info/entry_points.txt +0 -5
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
|
@@ -1,279 +0,0 @@
|
|
|
1
|
-
# Template Placeholders
|
|
2
|
-
TEMPLATE_PARAMS = {
|
|
3
|
-
"model_type": "ensemble_regressor",
|
|
4
|
-
"target_column": "solubility",
|
|
5
|
-
"feature_list": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
|
|
6
|
-
"model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/aqsol-ensemble/training"
|
|
7
|
-
}
|
|
8
|
-
|
|
9
|
-
# Imports for XGB Model
|
|
10
|
-
import xgboost as xgb
|
|
11
|
-
import awswrangler as wr
|
|
12
|
-
import numpy as np
|
|
13
|
-
|
|
14
|
-
# Model Performance Scores
|
|
15
|
-
from sklearn.metrics import (
|
|
16
|
-
mean_absolute_error,
|
|
17
|
-
r2_score,
|
|
18
|
-
root_mean_squared_error
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
from io import StringIO
|
|
22
|
-
import json
|
|
23
|
-
import argparse
|
|
24
|
-
import os
|
|
25
|
-
import pandas as pd
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
# Function to check if dataframe is empty
|
|
29
|
-
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
|
|
30
|
-
"""
|
|
31
|
-
Check if the provided dataframe is empty and raise an exception if it is.
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
df (pd.DataFrame): DataFrame to check
|
|
35
|
-
df_name (str): Name of the DataFrame
|
|
36
|
-
"""
|
|
37
|
-
if df.empty:
|
|
38
|
-
msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
|
|
39
|
-
print(msg)
|
|
40
|
-
raise ValueError(msg)
|
|
41
|
-
|
|
42
|
-
def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
|
|
43
|
-
"""
|
|
44
|
-
Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
|
|
45
|
-
Prioritizes exact case matches first, then falls back to case-insensitive matching if no exact match exists.
|
|
46
|
-
|
|
47
|
-
Args:
|
|
48
|
-
df (pd.DataFrame): The DataFrame with the original columns.
|
|
49
|
-
model_features (list): The desired list of feature names (mixed case allowed).
|
|
50
|
-
|
|
51
|
-
Returns:
|
|
52
|
-
pd.DataFrame: The DataFrame with renamed columns to match the model's feature names.
|
|
53
|
-
"""
|
|
54
|
-
# Create a mapping for exact and case-insensitive matching
|
|
55
|
-
exact_match_set = set(df.columns)
|
|
56
|
-
column_map = {}
|
|
57
|
-
|
|
58
|
-
# Build the case-insensitive map (if we have any duplicate columns, the first one wins)
|
|
59
|
-
for col in df.columns:
|
|
60
|
-
lower_col = col.lower()
|
|
61
|
-
if lower_col not in column_map:
|
|
62
|
-
column_map[lower_col] = col
|
|
63
|
-
|
|
64
|
-
# Create a dictionary for renaming
|
|
65
|
-
rename_dict = {}
|
|
66
|
-
for feature in model_features:
|
|
67
|
-
# Check for an exact match first
|
|
68
|
-
if feature in exact_match_set:
|
|
69
|
-
rename_dict[feature] = feature
|
|
70
|
-
|
|
71
|
-
# If not an exact match, fall back to case-insensitive matching
|
|
72
|
-
elif feature.lower() in column_map:
|
|
73
|
-
rename_dict[column_map[feature.lower()]] = feature
|
|
74
|
-
|
|
75
|
-
# Rename the columns in the DataFrame to match the model's feature names
|
|
76
|
-
return df.rename(columns=rename_dict)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
if __name__ == "__main__":
|
|
80
|
-
"""The main function is for training the XGBoost Quantile Regression models"""
|
|
81
|
-
|
|
82
|
-
# Harness Template Parameters
|
|
83
|
-
target = TEMPLATE_PARAMS["target_column"]
|
|
84
|
-
feature_list = TEMPLATE_PARAMS["feature_list"]
|
|
85
|
-
model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
|
|
86
|
-
models = {}
|
|
87
|
-
|
|
88
|
-
# Script arguments for input/output directories
|
|
89
|
-
parser = argparse.ArgumentParser()
|
|
90
|
-
parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
|
|
91
|
-
parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
|
|
92
|
-
parser.add_argument(
|
|
93
|
-
"--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
|
|
94
|
-
)
|
|
95
|
-
args = parser.parse_args()
|
|
96
|
-
|
|
97
|
-
# Read the training data into DataFrames
|
|
98
|
-
training_files = [
|
|
99
|
-
os.path.join(args.train, file)
|
|
100
|
-
for file in os.listdir(args.train)
|
|
101
|
-
if file.endswith(".csv")
|
|
102
|
-
]
|
|
103
|
-
print(f"Training Files: {training_files}")
|
|
104
|
-
|
|
105
|
-
# Combine files and read them all into a single pandas dataframe
|
|
106
|
-
df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
107
|
-
|
|
108
|
-
# Check if the dataframe is empty
|
|
109
|
-
check_dataframe(df, "training_df")
|
|
110
|
-
|
|
111
|
-
# Features/Target output
|
|
112
|
-
print(f"Target: {target}")
|
|
113
|
-
print(f"Features: {str(feature_list)}")
|
|
114
|
-
print(f"Data Shape: {df.shape}")
|
|
115
|
-
|
|
116
|
-
# Grab our Features and Target with traditional X, y handles
|
|
117
|
-
y = df[target]
|
|
118
|
-
X = df[feature_list]
|
|
119
|
-
|
|
120
|
-
# Train 50 models with random 70/30 splits of the data
|
|
121
|
-
for model_id in range(50):
|
|
122
|
-
# Model Name
|
|
123
|
-
model_name = f"m_{model_id:02}"
|
|
124
|
-
|
|
125
|
-
# Bootstrap sample (50% with replacement)
|
|
126
|
-
sample_size = int(0.5 * len(X))
|
|
127
|
-
bootstrap_indices = np.random.choice(len(X), size=sample_size, replace=True)
|
|
128
|
-
X_train, y_train = X.iloc[bootstrap_indices], y.iloc[bootstrap_indices]
|
|
129
|
-
print(f"Training Model {model_name} with {len(X_train)} rows")
|
|
130
|
-
model = xgb.XGBRegressor(reg_alpha=0.1, reg_lambda=1.0)
|
|
131
|
-
model.fit(X_train, y_train)
|
|
132
|
-
|
|
133
|
-
# Store the model
|
|
134
|
-
models[model_name] = model
|
|
135
|
-
|
|
136
|
-
# Run predictions for each model
|
|
137
|
-
all_predictions = {model_name: model.predict(X) for model_name, model in models.items()}
|
|
138
|
-
|
|
139
|
-
# Create a copy of the provided DataFrame and add the new columns
|
|
140
|
-
result_df = df[[target]].copy()
|
|
141
|
-
|
|
142
|
-
# Add the model predictions to the DataFrame
|
|
143
|
-
for name, preds in all_predictions.items():
|
|
144
|
-
result_df[name] = preds
|
|
145
|
-
|
|
146
|
-
# Add the main prediction to the DataFrame (mean of all models)
|
|
147
|
-
result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)
|
|
148
|
-
|
|
149
|
-
# Now compute residuals on the rmse prediction
|
|
150
|
-
result_df["residual"] = result_df[target] - result_df["prediction"]
|
|
151
|
-
result_df["residual_abs"] = result_df["residual"].abs()
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
# Save the results dataframe to S3
|
|
155
|
-
wr.s3.to_csv(
|
|
156
|
-
result_df,
|
|
157
|
-
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
158
|
-
index=False,
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
# Report Performance Metrics
|
|
162
|
-
rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
|
|
163
|
-
mae = mean_absolute_error(result_df[target], result_df["prediction"])
|
|
164
|
-
r2 = r2_score(result_df[target], result_df["prediction"])
|
|
165
|
-
print(f"RMSE: {rmse:.3f}")
|
|
166
|
-
print(f"MAE: {mae:.3f}")
|
|
167
|
-
print(f"R2: {r2:.3f}")
|
|
168
|
-
print(f"NumRows: {len(result_df)}")
|
|
169
|
-
|
|
170
|
-
# Now save the models
|
|
171
|
-
for name, model in models.items():
|
|
172
|
-
model_path = os.path.join(args.model_dir, f"{name}.json")
|
|
173
|
-
print(f"Saving model: {model_path}")
|
|
174
|
-
model.save_model(model_path)
|
|
175
|
-
|
|
176
|
-
# Also save the features (this will validate input during predictions)
|
|
177
|
-
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
|
|
178
|
-
json.dump(feature_list, fp)
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
def model_fn(model_dir) -> dict:
|
|
182
|
-
"""Deserialized and return all the fitted models from the model directory.
|
|
183
|
-
|
|
184
|
-
Args:
|
|
185
|
-
model_dir (str): The directory where the models are stored.
|
|
186
|
-
|
|
187
|
-
Returns:
|
|
188
|
-
dict: A dictionary of the models.
|
|
189
|
-
"""
|
|
190
|
-
|
|
191
|
-
# Load ALL the models from the model directory
|
|
192
|
-
models = {}
|
|
193
|
-
for file in os.listdir(model_dir):
|
|
194
|
-
if file.startswith("m_") and file.endswith(".json"): # The Quantile models
|
|
195
|
-
# Load the model
|
|
196
|
-
model_path = os.path.join(model_dir, file)
|
|
197
|
-
print(f"Loading model: {model_path}")
|
|
198
|
-
model = xgb.XGBRegressor()
|
|
199
|
-
model.load_model(model_path)
|
|
200
|
-
|
|
201
|
-
# Store the model
|
|
202
|
-
m_name = os.path.splitext(file)[0]
|
|
203
|
-
models[m_name] = model
|
|
204
|
-
|
|
205
|
-
# Return all the models
|
|
206
|
-
return models
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
def input_fn(input_data, content_type):
|
|
210
|
-
"""Parse input data and return a DataFrame."""
|
|
211
|
-
if not input_data:
|
|
212
|
-
raise ValueError("Empty input data is not supported!")
|
|
213
|
-
|
|
214
|
-
# Decode bytes to string if necessary
|
|
215
|
-
if isinstance(input_data, bytes):
|
|
216
|
-
input_data = input_data.decode("utf-8")
|
|
217
|
-
|
|
218
|
-
if "text/csv" in content_type:
|
|
219
|
-
return pd.read_csv(StringIO(input_data))
|
|
220
|
-
elif "application/json" in content_type:
|
|
221
|
-
return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
|
|
222
|
-
else:
|
|
223
|
-
raise ValueError(f"{content_type} not supported!")
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
def output_fn(output_df, accept_type):
|
|
227
|
-
"""Supports both CSV and JSON output formats."""
|
|
228
|
-
if "text/csv" in accept_type:
|
|
229
|
-
csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
|
|
230
|
-
return csv_output, "text/csv"
|
|
231
|
-
elif "application/json" in accept_type:
|
|
232
|
-
return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
|
|
233
|
-
else:
|
|
234
|
-
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def predict_fn(df, models) -> pd.DataFrame:
|
|
238
|
-
"""Make Predictions with our XGB Quantile Regression Model
|
|
239
|
-
|
|
240
|
-
Args:
|
|
241
|
-
df (pd.DataFrame): The input DataFrame
|
|
242
|
-
models (dict): The dictionary of models to use for predictions
|
|
243
|
-
|
|
244
|
-
Returns:
|
|
245
|
-
pd.DataFrame: The DataFrame with the predictions added
|
|
246
|
-
"""
|
|
247
|
-
|
|
248
|
-
# Grab our feature columns (from training)
|
|
249
|
-
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
250
|
-
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
251
|
-
model_features = json.load(fp)
|
|
252
|
-
print(f"Model Features: {model_features}")
|
|
253
|
-
|
|
254
|
-
# We're going match features in a case-insensitive manner, accounting for all the permutations
|
|
255
|
-
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
256
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
257
|
-
matched_df = match_features_case_insensitive(df, model_features)
|
|
258
|
-
|
|
259
|
-
# Predict the features against all the models
|
|
260
|
-
for name, model in models.items():
|
|
261
|
-
df[name] = model.predict(matched_df[model_features])
|
|
262
|
-
|
|
263
|
-
# Add quantiles for consistency with other UQ models
|
|
264
|
-
df["q_025"] = df[[name for name in df.columns if name.startswith("m_")]].quantile(0.025, axis=1)
|
|
265
|
-
df["q_975"] = df[[name for name in df.columns if name.startswith("m_")]].quantile(0.975, axis=1)
|
|
266
|
-
df["q_25"] = df[[name for name in df.columns if name.startswith("m_")]].quantile(0.25, axis=1)
|
|
267
|
-
df["q_75"] = df[[name for name in df.columns if name.startswith("m_")]].quantile(0.75, axis=1)
|
|
268
|
-
|
|
269
|
-
# Compute the mean, min, max and stddev of the predictions
|
|
270
|
-
df["prediction"] = df[[name for name in df.columns if name.startswith("m_")]].mean(axis=1)
|
|
271
|
-
df["p_min"] = df[[name for name in df.columns if name.startswith("m_")]].min(axis=1)
|
|
272
|
-
df["p_max"] = df[[name for name in df.columns if name.startswith("m_")]].max(axis=1)
|
|
273
|
-
df["prediction_std"] = df[[name for name in df.columns if name.startswith("m_")]].std(axis=1)
|
|
274
|
-
|
|
275
|
-
# Reorganize the columns so they are in alphabetical order
|
|
276
|
-
df = df.reindex(sorted(df.columns), axis=1)
|
|
277
|
-
|
|
278
|
-
# All done, return the DataFrame
|
|
279
|
-
return df
|
|
@@ -1,279 +0,0 @@
|
|
|
1
|
-
# Imports for XGB Model
|
|
2
|
-
import xgboost as xgb
|
|
3
|
-
import awswrangler as wr
|
|
4
|
-
from sklearn.model_selection import train_test_split
|
|
5
|
-
|
|
6
|
-
# Model Performance Scores
|
|
7
|
-
from sklearn.metrics import (
|
|
8
|
-
mean_absolute_error,
|
|
9
|
-
r2_score,
|
|
10
|
-
root_mean_squared_error
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
from io import StringIO
|
|
14
|
-
import json
|
|
15
|
-
import argparse
|
|
16
|
-
import os
|
|
17
|
-
import pandas as pd
|
|
18
|
-
|
|
19
|
-
# Template Placeholders
|
|
20
|
-
TEMPLATE_PARAMS = {
|
|
21
|
-
"model_type": "{{model_type}}",
|
|
22
|
-
"target_column": "{{target_column}}",
|
|
23
|
-
"features": "{{feature_list}}",
|
|
24
|
-
"model_metrics_s3_path": "{{model_metrics_s3_path}}",
|
|
25
|
-
"train_all_data": "{{train_all_data}}"
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
# Function to check if dataframe is empty
|
|
29
|
-
def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
|
|
30
|
-
"""
|
|
31
|
-
Check if the provided dataframe is empty and raise an exception if it is.
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
df (pd.DataFrame): DataFrame to check
|
|
35
|
-
df_name (str): Name of the DataFrame
|
|
36
|
-
"""
|
|
37
|
-
if df.empty:
|
|
38
|
-
msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
|
|
39
|
-
print(msg)
|
|
40
|
-
raise ValueError(msg)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
|
|
44
|
-
"""
|
|
45
|
-
Matches and renames DataFrame columns to match model feature names (case-insensitive).
|
|
46
|
-
Prioritizes exact matches, then case-insensitive matches.
|
|
47
|
-
|
|
48
|
-
Raises ValueError if any model features cannot be matched.
|
|
49
|
-
"""
|
|
50
|
-
df_columns_lower = {col.lower(): col for col in df.columns}
|
|
51
|
-
rename_dict = {}
|
|
52
|
-
missing = []
|
|
53
|
-
for feature in model_features:
|
|
54
|
-
if feature in df.columns:
|
|
55
|
-
continue # Exact match
|
|
56
|
-
elif feature.lower() in df_columns_lower:
|
|
57
|
-
rename_dict[df_columns_lower[feature.lower()]] = feature
|
|
58
|
-
else:
|
|
59
|
-
missing.append(feature)
|
|
60
|
-
|
|
61
|
-
if missing:
|
|
62
|
-
raise ValueError(f"Features not found: {missing}")
|
|
63
|
-
|
|
64
|
-
# Rename the DataFrame columns to match the model features
|
|
65
|
-
return df.rename(columns=rename_dict)
|
|
66
|
-
|
|
67
|
-
if __name__ == "__main__":
|
|
68
|
-
"""The main function is for training the XGBoost Quantile Regression models"""
|
|
69
|
-
|
|
70
|
-
# Harness Template Parameters
|
|
71
|
-
target = TEMPLATE_PARAMS["target_column"]
|
|
72
|
-
features = TEMPLATE_PARAMS["features"]
|
|
73
|
-
model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
|
|
74
|
-
train_all_data = TEMPLATE_PARAMS["train_all_data"]
|
|
75
|
-
validation_split = 0.2
|
|
76
|
-
quantiles = [0.025, 0.25, 0.50, 0.75, 0.975]
|
|
77
|
-
q_models = {}
|
|
78
|
-
|
|
79
|
-
# Script arguments for input/output directories
|
|
80
|
-
parser = argparse.ArgumentParser()
|
|
81
|
-
parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
|
|
82
|
-
parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
|
|
83
|
-
parser.add_argument(
|
|
84
|
-
"--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
|
|
85
|
-
)
|
|
86
|
-
args = parser.parse_args()
|
|
87
|
-
|
|
88
|
-
# Load training data from the specified directory
|
|
89
|
-
training_files = [
|
|
90
|
-
os.path.join(args.train, file)
|
|
91
|
-
for file in os.listdir(args.train) if file.endswith(".csv")
|
|
92
|
-
]
|
|
93
|
-
print(f"Training Files: {training_files}")
|
|
94
|
-
|
|
95
|
-
# Combine files and read them all into a single pandas dataframe
|
|
96
|
-
df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
97
|
-
|
|
98
|
-
# Check if the DataFrame is empty
|
|
99
|
-
check_dataframe(df, "training_df")
|
|
100
|
-
|
|
101
|
-
# Training data split logic
|
|
102
|
-
if train_all_data:
|
|
103
|
-
# Use all data for both training and validation
|
|
104
|
-
print("Training on all data...")
|
|
105
|
-
df_train = df.copy()
|
|
106
|
-
df_val = df.copy()
|
|
107
|
-
elif "training" in df.columns:
|
|
108
|
-
# Split data based on a 'training' column if it exists
|
|
109
|
-
print("Splitting data based on 'training' column...")
|
|
110
|
-
df_train = df[df["training"]].copy()
|
|
111
|
-
df_val = df[~df["training"]].copy()
|
|
112
|
-
else:
|
|
113
|
-
# Perform a random split if no 'training' column is found
|
|
114
|
-
print("Splitting data randomly...")
|
|
115
|
-
df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
|
|
116
|
-
|
|
117
|
-
# Features/Target output
|
|
118
|
-
print(f"Target: {target}")
|
|
119
|
-
print(f"Features: {str(features)}")
|
|
120
|
-
print(f"Data Shape: {df.shape}")
|
|
121
|
-
|
|
122
|
-
# Prepare features and targets for training
|
|
123
|
-
X_train = df_train[features]
|
|
124
|
-
X_val = df_val[features]
|
|
125
|
-
y_train = df_train[target]
|
|
126
|
-
y_val = df_val[target]
|
|
127
|
-
|
|
128
|
-
# Train models for each of the quantiles
|
|
129
|
-
for q in quantiles:
|
|
130
|
-
params = {
|
|
131
|
-
"objective": "reg:quantileerror",
|
|
132
|
-
"quantile_alpha": q,
|
|
133
|
-
}
|
|
134
|
-
model = xgb.XGBRegressor(**params)
|
|
135
|
-
model.fit(X_train, y_train)
|
|
136
|
-
|
|
137
|
-
# Convert quantile to string
|
|
138
|
-
q_str = f"q_{int(q * 100)}" if (q * 100) == int(q * 100) else f"q_{int(q * 1000):03d}"
|
|
139
|
-
|
|
140
|
-
# Store the model
|
|
141
|
-
q_models[q_str] = model
|
|
142
|
-
|
|
143
|
-
# Run predictions for each quantile
|
|
144
|
-
quantile_predictions = {q: model.predict(X_val) for q, model in q_models.items()}
|
|
145
|
-
|
|
146
|
-
# Create a copy of the validation DataFrame and add the new columns
|
|
147
|
-
result_df = df_val[[target]].copy()
|
|
148
|
-
|
|
149
|
-
# Add the quantile predictions to the DataFrame
|
|
150
|
-
for name, preds in quantile_predictions.items():
|
|
151
|
-
result_df[name] = preds
|
|
152
|
-
|
|
153
|
-
# Add the median as the main prediction
|
|
154
|
-
result_df["prediction"] = result_df["q_50"]
|
|
155
|
-
|
|
156
|
-
# Now compute residuals on the prediction
|
|
157
|
-
result_df["residual"] = result_df[target] - result_df["prediction"]
|
|
158
|
-
result_df["residual_abs"] = result_df["residual"].abs()
|
|
159
|
-
|
|
160
|
-
# Save the results dataframe to S3
|
|
161
|
-
wr.s3.to_csv(
|
|
162
|
-
result_df,
|
|
163
|
-
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
164
|
-
index=False,
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
# Report Performance Metrics
|
|
168
|
-
rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
|
|
169
|
-
mae = mean_absolute_error(result_df[target], result_df["prediction"])
|
|
170
|
-
r2 = r2_score(result_df[target], result_df["prediction"])
|
|
171
|
-
print(f"RMSE: {rmse:.3f}")
|
|
172
|
-
print(f"MAE: {mae:.3f}")
|
|
173
|
-
print(f"R2: {r2:.3f}")
|
|
174
|
-
print(f"NumRows: {len(result_df)}")
|
|
175
|
-
|
|
176
|
-
# Now save the quantile models
|
|
177
|
-
for name, model in q_models.items():
|
|
178
|
-
model_path = os.path.join(args.model_dir, f"{name}.json")
|
|
179
|
-
print(f"Saving model: {model_path}")
|
|
180
|
-
model.save_model(model_path)
|
|
181
|
-
|
|
182
|
-
# Also save the features (this will validate input during predictions)
|
|
183
|
-
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
|
|
184
|
-
json.dump(features, fp)
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
def model_fn(model_dir) -> dict:
|
|
188
|
-
"""Deserialized and return all the fitted models from the model directory.
|
|
189
|
-
|
|
190
|
-
Args:
|
|
191
|
-
model_dir (str): The directory where the models are stored.
|
|
192
|
-
|
|
193
|
-
Returns:
|
|
194
|
-
dict: A dictionary of the models.
|
|
195
|
-
"""
|
|
196
|
-
|
|
197
|
-
# Load ALL the Quantile models from the model directory
|
|
198
|
-
models = {}
|
|
199
|
-
for file in os.listdir(model_dir):
|
|
200
|
-
if file.startswith("q") and file.endswith(".json"): # The Quantile models
|
|
201
|
-
# Load the model
|
|
202
|
-
model_path = os.path.join(model_dir, file)
|
|
203
|
-
print(f"Loading model: {model_path}")
|
|
204
|
-
model = xgb.XGBRegressor()
|
|
205
|
-
model.load_model(model_path)
|
|
206
|
-
|
|
207
|
-
# Store the quantile model
|
|
208
|
-
q_name = os.path.splitext(file)[0]
|
|
209
|
-
models[q_name] = model
|
|
210
|
-
|
|
211
|
-
# Return all the models
|
|
212
|
-
return models
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def input_fn(input_data, content_type):
|
|
216
|
-
"""Parse input data and return a DataFrame."""
|
|
217
|
-
if not input_data:
|
|
218
|
-
raise ValueError("Empty input data is not supported!")
|
|
219
|
-
|
|
220
|
-
# Decode bytes to string if necessary
|
|
221
|
-
if isinstance(input_data, bytes):
|
|
222
|
-
input_data = input_data.decode("utf-8")
|
|
223
|
-
|
|
224
|
-
if "text/csv" in content_type:
|
|
225
|
-
return pd.read_csv(StringIO(input_data))
|
|
226
|
-
elif "application/json" in content_type:
|
|
227
|
-
return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
|
|
228
|
-
else:
|
|
229
|
-
raise ValueError(f"{content_type} not supported!")
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def output_fn(output_df, accept_type):
|
|
233
|
-
"""Supports both CSV and JSON output formats."""
|
|
234
|
-
if "text/csv" in accept_type:
|
|
235
|
-
csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
|
|
236
|
-
return csv_output, "text/csv"
|
|
237
|
-
elif "application/json" in accept_type:
|
|
238
|
-
return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
|
|
239
|
-
else:
|
|
240
|
-
raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
def predict_fn(df, models) -> pd.DataFrame:
|
|
244
|
-
"""Make Predictions with our XGB Quantile Regression Model
|
|
245
|
-
|
|
246
|
-
Args:
|
|
247
|
-
df (pd.DataFrame): The input DataFrame
|
|
248
|
-
models (dict): The dictionary of models to use for predictions
|
|
249
|
-
|
|
250
|
-
Returns:
|
|
251
|
-
pd.DataFrame: The DataFrame with the predictions added
|
|
252
|
-
"""
|
|
253
|
-
|
|
254
|
-
# Grab our feature columns (from training)
|
|
255
|
-
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
256
|
-
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
257
|
-
model_features = json.load(fp)
|
|
258
|
-
print(f"Model Features: {model_features}")
|
|
259
|
-
|
|
260
|
-
# We're going match features in a case-insensitive manner, accounting for all the permutations
|
|
261
|
-
# - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
|
|
262
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
263
|
-
matched_df = match_features_case_insensitive(df, model_features)
|
|
264
|
-
|
|
265
|
-
# Predict the features against all the models
|
|
266
|
-
for name, model in models.items():
|
|
267
|
-
df[name] = model.predict(matched_df[model_features])
|
|
268
|
-
|
|
269
|
-
# Use the median prediction as the main prediction
|
|
270
|
-
df["prediction"] = df["q_50"]
|
|
271
|
-
|
|
272
|
-
# Estimate the standard deviation of the predictions using the interquartile range
|
|
273
|
-
df["prediction_std"] = (df["q_75"] - df["q_25"]) / 1.35
|
|
274
|
-
|
|
275
|
-
# Reorganize the columns so they are in alphabetical order
|
|
276
|
-
df = df.reindex(sorted(df.columns), axis=1)
|
|
277
|
-
|
|
278
|
-
# All done, return the DataFrame
|
|
279
|
-
return df
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
# Note: In general this file should be empty (as the default inference image has all required libraries)
|