workbench 0.8.162__py3-none-any.whl → 0.8.202__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic. Click here for more details.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
- workbench/algorithms/dataframe/proximity.py +261 -235
- workbench/algorithms/graph/light/proximity_graph.py +10 -8
- workbench/api/__init__.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +11 -0
- workbench/api/feature_set.py +11 -8
- workbench/api/meta.py +5 -2
- workbench/api/model.py +16 -15
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +256 -118
- workbench/core/artifacts/feature_set_core.py +265 -16
- workbench/core/artifacts/model_core.py +107 -60
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +42 -32
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/chemprop/chemprop.template +852 -0
- workbench/model_scripts/chemprop/generated_model_script.py +852 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +166 -62
- workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
- workbench/model_scripts/custom_models/uq_models/proximity.py +261 -235
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/pytorch_model/generated_model_script.py +373 -190
- workbench/model_scripts/pytorch_model/pytorch.template +370 -187
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +17 -9
- workbench/model_scripts/uq_models/generated_model_script.py +605 -0
- workbench/model_scripts/uq_models/mapie.template +605 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +37 -46
- workbench/model_scripts/xgb_model/xgb_model.template +44 -46
- workbench/repl/workbench_shell.py +28 -14
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/chemprop_utils.py +760 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +95 -34
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/pytorch_utils.py +526 -0
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +371 -156
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +9 -7
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/METADATA +27 -6
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/RECORD +101 -85
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/licenses/LICENSE +1 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/WHEEL +0 -0
- {workbench-0.8.162.dist-info → workbench-0.8.202.dist-info}/top_level.txt +0 -0
|
@@ -4,13 +4,10 @@ import awswrangler as wr
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
6
6
|
# Model Performance Scores
|
|
7
|
-
from sklearn.metrics import
|
|
8
|
-
mean_absolute_error,
|
|
9
|
-
r2_score,
|
|
10
|
-
root_mean_squared_error
|
|
11
|
-
)
|
|
7
|
+
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
|
|
12
8
|
from sklearn.model_selection import KFold
|
|
13
9
|
from scipy.optimize import minimize
|
|
10
|
+
from scipy.stats import spearmanr
|
|
14
11
|
|
|
15
12
|
from io import StringIO
|
|
16
13
|
import json
|
|
@@ -23,7 +20,7 @@ TEMPLATE_PARAMS = {
|
|
|
23
20
|
"features": "{{feature_list}}",
|
|
24
21
|
"target": "{{target_column}}",
|
|
25
22
|
"train_all_data": "{{train_all_data}}",
|
|
26
|
-
"model_metrics_s3_path": "{{model_metrics_s3_path}}"
|
|
23
|
+
"model_metrics_s3_path": "{{model_metrics_s3_path}}",
|
|
27
24
|
}
|
|
28
25
|
|
|
29
26
|
|
|
@@ -47,7 +44,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
|
|
|
47
44
|
"""
|
|
48
45
|
Matches and renames DataFrame columns to match model feature names (case-insensitive).
|
|
49
46
|
Prioritizes exact matches, then case-insensitive matches.
|
|
50
|
-
|
|
47
|
+
|
|
51
48
|
Raises ValueError if any model features cannot be matched.
|
|
52
49
|
"""
|
|
53
50
|
df_columns_lower = {col.lower(): col for col in df.columns}
|
|
@@ -90,10 +87,7 @@ if __name__ == "__main__":
|
|
|
90
87
|
args = parser.parse_args()
|
|
91
88
|
|
|
92
89
|
# Load training data from the specified directory
|
|
93
|
-
training_files = [
|
|
94
|
-
os.path.join(args.train, file)
|
|
95
|
-
for file in os.listdir(args.train) if file.endswith(".csv")
|
|
96
|
-
]
|
|
90
|
+
training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
|
|
97
91
|
df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
98
92
|
|
|
99
93
|
# Check if the DataFrame is empty
|
|
@@ -172,16 +166,14 @@ if __name__ == "__main__":
|
|
|
172
166
|
cv_residuals = np.array(cv_residuals)
|
|
173
167
|
cv_uncertainties = np.array(cv_uncertainties)
|
|
174
168
|
|
|
175
|
-
|
|
176
169
|
# Optimize calibration parameters: σ_cal = a * σ_uc + b
|
|
177
170
|
def neg_log_likelihood(params):
|
|
178
171
|
a, b = params
|
|
179
172
|
sigma_cal = a * cv_uncertainties + b
|
|
180
173
|
sigma_cal = np.maximum(sigma_cal, 1e-8) # Prevent division by zero
|
|
181
|
-
return np.sum(0.5 * np.log(2 * np.pi * sigma_cal
|
|
174
|
+
return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))
|
|
182
175
|
|
|
183
|
-
|
|
184
|
-
result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
|
|
176
|
+
result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
|
|
185
177
|
cal_a, cal_b = result.x
|
|
186
178
|
|
|
187
179
|
print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
|
|
@@ -205,7 +197,9 @@ if __name__ == "__main__":
|
|
|
205
197
|
result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)
|
|
206
198
|
|
|
207
199
|
# Compute uncalibrated uncertainty
|
|
208
|
-
result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
|
|
200
|
+
result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
|
|
201
|
+
axis=1
|
|
202
|
+
)
|
|
209
203
|
|
|
210
204
|
# Apply calibration to uncertainty
|
|
211
205
|
result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
|
|
@@ -224,11 +218,16 @@ if __name__ == "__main__":
|
|
|
224
218
|
# Report Performance Metrics
|
|
225
219
|
rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
|
|
226
220
|
mae = mean_absolute_error(result_df[target], result_df["prediction"])
|
|
221
|
+
medae = median_absolute_error(result_df[target], result_df["prediction"])
|
|
227
222
|
r2 = r2_score(result_df[target], result_df["prediction"])
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
print(f"
|
|
231
|
-
print(f"
|
|
223
|
+
spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
|
|
224
|
+
support = len(result_df)
|
|
225
|
+
print(f"rmse: {rmse:.3f}")
|
|
226
|
+
print(f"mae: {mae:.3f}")
|
|
227
|
+
print(f"medae: {medae:.3f}")
|
|
228
|
+
print(f"r2: {r2:.3f}")
|
|
229
|
+
print(f"spearmanr: {spearman_corr:.3f}")
|
|
230
|
+
print(f"support: {support}")
|
|
232
231
|
|
|
233
232
|
# Now save the models
|
|
234
233
|
for name, model in models.items():
|
|
@@ -352,4 +351,4 @@ def predict_fn(df, models) -> pd.DataFrame:
|
|
|
352
351
|
df = df.reindex(sorted(df.columns), axis=1)
|
|
353
352
|
|
|
354
353
|
# All done, return the DataFrame
|
|
355
|
-
return df
|
|
354
|
+
return df
|
|
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
|
|
|
9
9
|
TEMPLATE_PARAMS = {
|
|
10
10
|
"features": "{{feature_list}}",
|
|
11
11
|
"target": "{{target_column}}",
|
|
12
|
-
"train_all_data": "{{train_all_data}}"
|
|
12
|
+
"train_all_data": "{{train_all_data}}",
|
|
13
13
|
}
|
|
14
14
|
|
|
15
15
|
from io import StringIO
|
|
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
|
|
|
33
33
|
"""
|
|
34
34
|
Matches and renames DataFrame columns to match model feature names (case-insensitive).
|
|
35
35
|
Prioritizes exact matches, then case-insensitive matches.
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
Raises ValueError if any model features cannot be matched.
|
|
38
38
|
"""
|
|
39
39
|
df_columns_lower = {col.lower(): col for col in df.columns}
|
|
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
|
|
|
46
46
|
rename_dict[df_columns_lower[feature.lower()]] = feature
|
|
47
47
|
else:
|
|
48
48
|
missing.append(feature)
|
|
49
|
-
|
|
49
|
+
|
|
50
50
|
if missing:
|
|
51
51
|
raise ValueError(f"Features not found: {missing}")
|
|
52
52
|
|
|
@@ -76,10 +76,7 @@ if __name__ == "__main__":
|
|
|
76
76
|
args = parser.parse_args()
|
|
77
77
|
|
|
78
78
|
# Load training data from the specified directory
|
|
79
|
-
training_files = [
|
|
80
|
-
os.path.join(args.train, file)
|
|
81
|
-
for file in os.listdir(args.train) if file.endswith(".csv")
|
|
82
|
-
]
|
|
79
|
+
training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
|
|
83
80
|
df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
84
81
|
|
|
85
82
|
# Check if the DataFrame is empty
|
|
@@ -112,10 +109,7 @@ if __name__ == "__main__":
|
|
|
112
109
|
)
|
|
113
110
|
|
|
114
111
|
# Create a Pipeline with StandardScaler
|
|
115
|
-
model = Pipeline([
|
|
116
|
-
("scaler", StandardScaler()),
|
|
117
|
-
("model", model)
|
|
118
|
-
])
|
|
112
|
+
model = Pipeline([("scaler", StandardScaler()), ("model", model)])
|
|
119
113
|
|
|
120
114
|
# Prepare features and targets for training
|
|
121
115
|
X_train = df_train[features]
|
|
@@ -1,34 +1,34 @@
|
|
|
1
1
|
# Model: NGBoost Regressor with Distribution output
|
|
2
2
|
from ngboost import NGBRegressor
|
|
3
|
-
from
|
|
3
|
+
from ngboost.distns import Cauchy
|
|
4
|
+
from xgboost import XGBRegressor # Point Estimator
|
|
4
5
|
from sklearn.model_selection import train_test_split
|
|
5
6
|
|
|
6
7
|
# Model Performance Scores
|
|
7
|
-
from sklearn.metrics import
|
|
8
|
-
|
|
9
|
-
r2_score,
|
|
10
|
-
root_mean_squared_error
|
|
11
|
-
)
|
|
8
|
+
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
|
|
9
|
+
from scipy.stats import spearmanr
|
|
12
10
|
|
|
13
11
|
from io import StringIO
|
|
14
12
|
import json
|
|
15
13
|
import argparse
|
|
16
14
|
import joblib
|
|
17
15
|
import os
|
|
16
|
+
import numpy as np
|
|
18
17
|
import pandas as pd
|
|
18
|
+
from typing import List, Tuple
|
|
19
19
|
|
|
20
20
|
# Local Imports
|
|
21
21
|
from proximity import Proximity
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
|
|
25
24
|
# Template Placeholders
|
|
26
25
|
TEMPLATE_PARAMS = {
|
|
27
26
|
"id_column": "{{id_column}}",
|
|
28
|
-
"features": "{{feature_list}}",
|
|
29
27
|
"target": "{{target_column}}",
|
|
28
|
+
"features": "{{feature_list}}",
|
|
29
|
+
"compressed_features": "{{compressed_features}}",
|
|
30
30
|
"train_all_data": "{{train_all_data}}",
|
|
31
|
-
"track_columns": "{{track_columns}}"
|
|
31
|
+
"track_columns": "{{track_columns}}",
|
|
32
32
|
}
|
|
33
33
|
|
|
34
34
|
|
|
@@ -72,16 +72,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
|
|
|
72
72
|
return df.rename(columns=rename_dict)
|
|
73
73
|
|
|
74
74
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
75
|
+
def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
|
|
76
|
+
"""
|
|
77
|
+
Converts appropriate columns to categorical type with consistent mappings.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
df (pd.DataFrame): The DataFrame to process.
|
|
81
|
+
features (list): List of feature names to consider for conversion.
|
|
82
|
+
category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
|
|
83
|
+
training mode. If populated, we're in inference mode.
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
tuple: (processed DataFrame, category mappings dictionary)
|
|
87
|
+
"""
|
|
88
|
+
# Training mode
|
|
89
|
+
if category_mappings == {}:
|
|
90
|
+
for col in df.select_dtypes(include=["object", "string"]):
|
|
91
|
+
if col in features and df[col].nunique() < 20:
|
|
92
|
+
print(f"Training mode: Converting {col} to category")
|
|
93
|
+
df[col] = df[col].astype("category")
|
|
94
|
+
category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
|
|
95
|
+
|
|
96
|
+
# Inference mode
|
|
97
|
+
else:
|
|
98
|
+
for col, categories in category_mappings.items():
|
|
99
|
+
if col in df.columns:
|
|
100
|
+
print(f"Inference mode: Applying categorical mapping for {col}")
|
|
101
|
+
df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
|
|
102
|
+
|
|
103
|
+
return df, category_mappings
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def decompress_features(
|
|
107
|
+
df: pd.DataFrame, features: List[str], compressed_features: List[str]
|
|
108
|
+
) -> Tuple[pd.DataFrame, List[str]]:
|
|
109
|
+
"""Prepare features for the model by decompressing bitstring features
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
df (pd.DataFrame): The features DataFrame
|
|
113
|
+
features (List[str]): Full list of feature names
|
|
114
|
+
compressed_features (List[str]): List of feature names to decompress (bitstrings)
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
pd.DataFrame: DataFrame with the decompressed features
|
|
118
|
+
List[str]: Updated list of feature names after decompression
|
|
119
|
+
|
|
120
|
+
Raises:
|
|
121
|
+
ValueError: If any missing values are found in the specified features
|
|
122
|
+
"""
|
|
123
|
+
|
|
124
|
+
# Check for any missing values in the required features
|
|
125
|
+
missing_counts = df[features].isna().sum()
|
|
126
|
+
if missing_counts.any():
|
|
127
|
+
missing_features = missing_counts[missing_counts > 0]
|
|
128
|
+
print(
|
|
129
|
+
f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
|
|
130
|
+
"WARNING: You might want to remove/replace all NaN values before processing."
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# Decompress the specified compressed features
|
|
134
|
+
decompressed_features = features.copy()
|
|
135
|
+
for feature in compressed_features:
|
|
136
|
+
if (feature not in df.columns) or (feature not in features):
|
|
137
|
+
print(f"Feature '{feature}' not in the features list, skipping decompression.")
|
|
138
|
+
continue
|
|
139
|
+
|
|
140
|
+
# Remove the feature from the list of features to avoid duplication
|
|
141
|
+
decompressed_features.remove(feature)
|
|
142
|
+
|
|
143
|
+
# Handle all compressed features as bitstrings
|
|
144
|
+
bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
|
|
145
|
+
prefix = feature[:3]
|
|
146
|
+
|
|
147
|
+
# Create all new columns at once - avoids fragmentation
|
|
148
|
+
new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
|
|
149
|
+
new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
|
|
150
|
+
|
|
151
|
+
# Add to features list
|
|
152
|
+
decompressed_features.extend(new_col_names)
|
|
153
|
+
|
|
154
|
+
# Drop original column and concatenate new ones
|
|
155
|
+
df = df.drop(columns=[feature])
|
|
156
|
+
df = pd.concat([df, new_df], axis=1)
|
|
157
|
+
|
|
158
|
+
return df, decompressed_features
|
|
159
|
+
|
|
160
|
+
|
|
80
161
|
if __name__ == "__main__":
|
|
81
162
|
# Template Parameters
|
|
82
163
|
id_column = TEMPLATE_PARAMS["id_column"]
|
|
83
|
-
features = TEMPLATE_PARAMS["features"]
|
|
84
164
|
target = TEMPLATE_PARAMS["target"]
|
|
165
|
+
features = TEMPLATE_PARAMS["features"]
|
|
166
|
+
orig_features = features.copy()
|
|
167
|
+
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
85
168
|
train_all_data = TEMPLATE_PARAMS["train_all_data"]
|
|
86
169
|
track_columns = TEMPLATE_PARAMS["track_columns"] # Can be None
|
|
87
170
|
validation_split = 0.2
|
|
@@ -95,63 +178,77 @@ if __name__ == "__main__":
|
|
|
95
178
|
)
|
|
96
179
|
args = parser.parse_args()
|
|
97
180
|
|
|
98
|
-
#
|
|
99
|
-
training_files = [
|
|
100
|
-
os.path.join(args.train, file)
|
|
101
|
-
for file in os.listdir(args.train) if file.endswith(".csv")
|
|
102
|
-
]
|
|
181
|
+
# Read the training data into DataFrames
|
|
182
|
+
training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
|
|
103
183
|
print(f"Training Files: {training_files}")
|
|
104
184
|
|
|
105
185
|
# Combine files and read them all into a single pandas dataframe
|
|
106
|
-
|
|
186
|
+
all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
187
|
+
|
|
188
|
+
# Check if the dataframe is empty
|
|
189
|
+
check_dataframe(all_df, "training_df")
|
|
190
|
+
|
|
191
|
+
# Features/Target output
|
|
192
|
+
print(f"Target: {target}")
|
|
193
|
+
print(f"Features: {str(features)}")
|
|
107
194
|
|
|
108
|
-
#
|
|
109
|
-
|
|
195
|
+
# Convert any features that might be categorical to 'category' type
|
|
196
|
+
all_df, category_mappings = convert_categorical_types(all_df, features)
|
|
110
197
|
|
|
111
|
-
#
|
|
198
|
+
# If we have compressed features, decompress them
|
|
199
|
+
if compressed_features:
|
|
200
|
+
print(f"Decompressing features {compressed_features}...")
|
|
201
|
+
all_df, features = decompress_features(all_df, features, compressed_features)
|
|
202
|
+
|
|
203
|
+
# Do we want to train on all the data?
|
|
112
204
|
if train_all_data:
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
print("
|
|
120
|
-
df_train =
|
|
121
|
-
df_val =
|
|
205
|
+
print("Training on ALL of the data")
|
|
206
|
+
df_train = all_df.copy()
|
|
207
|
+
df_val = all_df.copy()
|
|
208
|
+
|
|
209
|
+
# Does the dataframe have a training column?
|
|
210
|
+
elif "training" in all_df.columns:
|
|
211
|
+
print("Found training column, splitting data based on training column")
|
|
212
|
+
df_train = all_df[all_df["training"]]
|
|
213
|
+
df_val = all_df[~all_df["training"]]
|
|
122
214
|
else:
|
|
123
|
-
#
|
|
124
|
-
print("
|
|
125
|
-
df_train, df_val = train_test_split(
|
|
215
|
+
# Just do a random training Split
|
|
216
|
+
print("WARNING: No training column found, splitting data with random state=42")
|
|
217
|
+
df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
|
|
218
|
+
print(f"FIT/TRAIN: {df_train.shape}")
|
|
219
|
+
print(f"VALIDATION: {df_val.shape}")
|
|
126
220
|
|
|
127
221
|
# We're using XGBoost for point predictions and NGBoost for uncertainty quantification
|
|
128
222
|
xgb_model = XGBRegressor()
|
|
129
|
-
ngb_model = NGBRegressor()
|
|
223
|
+
ngb_model = NGBRegressor() # Dist=Cauchy) Seems to give HUGE prediction intervals
|
|
130
224
|
|
|
131
225
|
# Prepare features and targets for training
|
|
132
226
|
X_train = df_train[features]
|
|
133
|
-
|
|
227
|
+
X_validate = df_val[features]
|
|
134
228
|
y_train = df_train[target]
|
|
135
|
-
|
|
229
|
+
y_validate = df_val[target]
|
|
136
230
|
|
|
137
231
|
# Train both models using the training data
|
|
138
232
|
xgb_model.fit(X_train, y_train)
|
|
139
|
-
ngb_model.fit(X_train, y_train, X_val=
|
|
233
|
+
ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)
|
|
140
234
|
|
|
141
235
|
# Make Predictions on the Validation Set
|
|
142
236
|
print(f"Making Predictions on Validation Set...")
|
|
143
|
-
y_validate = df_val[target]
|
|
144
|
-
X_validate = df_val[features]
|
|
145
237
|
preds = xgb_model.predict(X_validate)
|
|
146
238
|
|
|
147
239
|
# Calculate various model performance metrics (regression)
|
|
148
240
|
rmse = root_mean_squared_error(y_validate, preds)
|
|
149
241
|
mae = mean_absolute_error(y_validate, preds)
|
|
242
|
+
medae = median_absolute_error(y_validate, preds)
|
|
150
243
|
r2 = r2_score(y_validate, preds)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
print(f"
|
|
154
|
-
print(f"
|
|
244
|
+
spearman_corr = spearmanr(y_validate, preds).correlation
|
|
245
|
+
support = len(df_val)
|
|
246
|
+
print(f"rmse: {rmse:.3f}")
|
|
247
|
+
print(f"mae: {mae:.3f}")
|
|
248
|
+
print(f"medae: {medae:.3f}")
|
|
249
|
+
print(f"r2: {r2:.3f}")
|
|
250
|
+
print(f"spearmanr: {spearman_corr:.3f}")
|
|
251
|
+
print(f"support: {support}")
|
|
155
252
|
|
|
156
253
|
# Save the trained XGBoost model
|
|
157
254
|
xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
|
|
@@ -159,9 +256,9 @@ if __name__ == "__main__":
|
|
|
159
256
|
# Save the trained NGBoost model
|
|
160
257
|
joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
|
|
161
258
|
|
|
162
|
-
# Save the
|
|
259
|
+
# Save the features (this will validate input during predictions)
|
|
163
260
|
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
|
|
164
|
-
json.dump(
|
|
261
|
+
json.dump(orig_features, fp) # We save the original features, not the decompressed ones
|
|
165
262
|
|
|
166
263
|
# Now the Proximity model
|
|
167
264
|
model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
|
|
@@ -187,11 +284,7 @@ def model_fn(model_dir) -> dict:
|
|
|
187
284
|
# Deserialize the proximity model
|
|
188
285
|
prox_model = Proximity.deserialize(model_dir)
|
|
189
286
|
|
|
190
|
-
return {
|
|
191
|
-
"xgboost": xgb_model,
|
|
192
|
-
"ngboost": ngb_model,
|
|
193
|
-
"proximity": prox_model
|
|
194
|
-
}
|
|
287
|
+
return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}
|
|
195
288
|
|
|
196
289
|
|
|
197
290
|
def input_fn(input_data, content_type):
|
|
@@ -251,20 +344,31 @@ def predict_fn(df, models) -> pd.DataFrame:
|
|
|
251
344
|
dist_params = y_dists.params
|
|
252
345
|
|
|
253
346
|
# Extract mean and std from distribution parameters
|
|
254
|
-
df["prediction_uq"] = dist_params[
|
|
255
|
-
df["prediction_std"] = dist_params[
|
|
347
|
+
df["prediction_uq"] = dist_params["loc"] # mean
|
|
348
|
+
df["prediction_std"] = dist_params["scale"] # standard deviation
|
|
256
349
|
|
|
257
350
|
# Add 95% prediction intervals using ppf (percent point function)
|
|
258
|
-
|
|
259
|
-
|
|
351
|
+
# Note: Our hybrid model uses XGB point prediction and NGBoost UQ
|
|
352
|
+
# so we need to adjust the bounds to include the point prediction
|
|
353
|
+
df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
|
|
354
|
+
df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
|
|
355
|
+
|
|
356
|
+
# Add 90% prediction intervals
|
|
357
|
+
df["q_05"] = y_dists.ppf(0.05) # 5th percentile
|
|
358
|
+
df["q_95"] = y_dists.ppf(0.95) # 95th percentile
|
|
359
|
+
|
|
360
|
+
# Add 80% prediction intervals
|
|
361
|
+
df["q_10"] = y_dists.ppf(0.10) # 10th percentile
|
|
362
|
+
df["q_90"] = y_dists.ppf(0.90) # 90th percentile
|
|
260
363
|
|
|
261
364
|
# Add 50% prediction intervals
|
|
262
|
-
df["q_25"] = y_dists.ppf(0.25)
|
|
263
|
-
df["q_75"] = y_dists.ppf(0.75)
|
|
365
|
+
df["q_25"] = y_dists.ppf(0.25) # 25th percentile
|
|
366
|
+
df["q_75"] = y_dists.ppf(0.75) # 75th percentile
|
|
264
367
|
|
|
265
|
-
#
|
|
266
|
-
|
|
267
|
-
|
|
368
|
+
# Reorder the quantile columns for easier reading
|
|
369
|
+
quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
|
|
370
|
+
other_cols = [col for col in df.columns if col not in quantile_cols]
|
|
371
|
+
df = df[other_cols + quantile_cols]
|
|
268
372
|
|
|
269
373
|
# Compute Nearest neighbors with Proximity model
|
|
270
374
|
models["proximity"].neighbors(df)
|
|
@@ -3,11 +3,8 @@ from ngboost import NGBRegressor
|
|
|
3
3
|
from sklearn.model_selection import train_test_split
|
|
4
4
|
|
|
5
5
|
# Model Performance Scores
|
|
6
|
-
from sklearn.metrics import
|
|
7
|
-
|
|
8
|
-
r2_score,
|
|
9
|
-
root_mean_squared_error
|
|
10
|
-
)
|
|
6
|
+
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
|
|
7
|
+
from scipy.stats import spearmanr
|
|
11
8
|
|
|
12
9
|
from io import StringIO
|
|
13
10
|
import json
|
|
@@ -21,7 +18,7 @@ import pandas as pd
|
|
|
21
18
|
TEMPLATE_PARAMS = {
|
|
22
19
|
"features": "{{feature_list}}",
|
|
23
20
|
"target": "{{target_column}}",
|
|
24
|
-
"train_all_data": "{{train_all_data}}"
|
|
21
|
+
"train_all_data": "{{train_all_data}}",
|
|
25
22
|
}
|
|
26
23
|
|
|
27
24
|
|
|
@@ -87,10 +84,7 @@ if __name__ == "__main__":
|
|
|
87
84
|
args = parser.parse_args()
|
|
88
85
|
|
|
89
86
|
# Load training data from the specified directory
|
|
90
|
-
training_files = [
|
|
91
|
-
os.path.join(args.train, file)
|
|
92
|
-
for file in os.listdir(args.train) if file.endswith(".csv")
|
|
93
|
-
]
|
|
87
|
+
training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
|
|
94
88
|
print(f"Training Files: {training_files}")
|
|
95
89
|
|
|
96
90
|
# Combine files and read them all into a single pandas dataframe
|
|
@@ -136,11 +130,16 @@ if __name__ == "__main__":
|
|
|
136
130
|
# Calculate various model performance metrics (regression)
|
|
137
131
|
rmse = root_mean_squared_error(y_validate, preds)
|
|
138
132
|
mae = mean_absolute_error(y_validate, preds)
|
|
133
|
+
medae = median_absolute_error(y_validate, preds)
|
|
139
134
|
r2 = r2_score(y_validate, preds)
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
print(f"
|
|
143
|
-
print(f"
|
|
135
|
+
spearman_corr = spearmanr(y_validate, preds).correlation
|
|
136
|
+
support = len(df_val)
|
|
137
|
+
print(f"rmse: {rmse:.3f}")
|
|
138
|
+
print(f"mae: {mae:.3f}")
|
|
139
|
+
print(f"medae: {medae:.3f}")
|
|
140
|
+
print(f"r2: {r2:.3f}")
|
|
141
|
+
print(f"spearmanr: {spearman_corr:.3f}")
|
|
142
|
+
print(f"support: {support}")
|
|
144
143
|
|
|
145
144
|
# Save the trained NGBoost model
|
|
146
145
|
joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
|
|
@@ -212,16 +211,29 @@ def predict_fn(df, model) -> pd.DataFrame:
|
|
|
212
211
|
dist_params = y_dists.params
|
|
213
212
|
|
|
214
213
|
# Extract mean and std from distribution parameters
|
|
215
|
-
df["prediction"] = dist_params[
|
|
216
|
-
df["prediction_std"] = dist_params[
|
|
214
|
+
df["prediction"] = dist_params["loc"] # mean
|
|
215
|
+
df["prediction_std"] = dist_params["scale"] # standard deviation
|
|
217
216
|
|
|
218
217
|
# Add 95% prediction intervals using ppf (percent point function)
|
|
219
218
|
df["q_025"] = y_dists.ppf(0.025) # 2.5th percentile
|
|
220
219
|
df["q_975"] = y_dists.ppf(0.975) # 97.5th percentile
|
|
221
220
|
|
|
221
|
+
# Add 90% prediction intervals
|
|
222
|
+
df["q_05"] = y_dists.ppf(0.05) # 5th percentile
|
|
223
|
+
df["q_95"] = y_dists.ppf(0.95) # 95th percentile
|
|
224
|
+
|
|
225
|
+
# Add 80% prediction intervals
|
|
226
|
+
df["q_10"] = y_dists.ppf(0.10) # 10th percentile
|
|
227
|
+
df["q_90"] = y_dists.ppf(0.90) # 90th percentile
|
|
228
|
+
|
|
222
229
|
# Add 50% prediction intervals
|
|
223
|
-
df["q_25"] = y_dists.ppf(0.25)
|
|
224
|
-
df["q_75"] = y_dists.ppf(0.75)
|
|
230
|
+
df["q_25"] = y_dists.ppf(0.25) # 25th percentile
|
|
231
|
+
df["q_75"] = y_dists.ppf(0.75) # 75th percentile
|
|
232
|
+
|
|
233
|
+
# Reorder the quantile columns for easier reading
|
|
234
|
+
quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
|
|
235
|
+
other_cols = [col for col in df.columns if col not in quantile_cols]
|
|
236
|
+
df = df[other_cols + quantile_cols]
|
|
225
237
|
|
|
226
238
|
# Return the modified DataFrame
|
|
227
239
|
return df
|