workbench 0.8.178-py3-none-any.whl → 0.8.180-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as possibly problematic.
- workbench/api/endpoint.py +3 -2
- workbench/core/artifacts/endpoint_core.py +5 -5
- workbench/core/artifacts/feature_set_core.py +32 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
- workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
- workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/quant_regression/quant_regression.template +5 -10
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
- workbench/model_scripts/xgb_model/xgb_model.template +23 -32
- workbench/utils/model_utils.py +2 -1
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/xgboost_model_utils.py +160 -137
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/METADATA +1 -1
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/RECORD +26 -26
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/WHEEL +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.178.dist-info → workbench-0.8.180.dist-info}/top_level.txt +0 -0
workbench/api/endpoint.py
CHANGED
@@ -4,6 +4,7 @@ Endpoints can be viewed in the AWS Sagemaker interfaces or in the Workbench
 Dashboard UI, which provides additional model details and performance metrics"""
 
 import pandas as pd
+from typing import Tuple
 
 # Workbench Imports
 from workbench.core.artifacts.endpoint_core import EndpointCore
@@ -70,14 +71,14 @@ class Endpoint(EndpointCore):
         """
         return super().fast_inference(eval_df, threads=threads)
 
-    def cross_fold_inference(self, nfolds: int = 5) -> dict:
+    def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
         """Run cross-fold inference (only works for XGBoost models)
 
         Args:
             nfolds (int): The number of folds to use for cross-validation (default: 5)
 
         Returns:
-            dict: A dictionary with
+            Tuple(dict, pd.DataFrame): A tuple containing a dictionary of metrics and a DataFrame with predictions
         """
         return super().cross_fold_inference(nfolds)
 
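With this release, cross_fold_inference() returns the out-of-fold predictions alongside the metrics dictionary. A minimal usage sketch (the endpoint name below is a placeholder, not something shipped in this release):

from workbench.api.endpoint import Endpoint

# "my-endpoint" is a hypothetical endpoint name used only for illustration
end = Endpoint("my-endpoint")

# New return shape: (cross-fold metrics dict, out-of-fold predictions DataFrame)
metrics, oof_df = end.cross_fold_inference(nfolds=5)
print(metrics)        # metrics are also persisted to the parameter store (see endpoint_core.py below)
print(oof_df.head())  # per-row out-of-fold predictions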
workbench/core/artifacts/endpoint_core.py
CHANGED

@@ -8,7 +8,7 @@ import pandas as pd
 import numpy as np
 from io import StringIO
 import awswrangler as wr
-from typing import Union, Optional
+from typing import Union, Optional, Tuple
 import hashlib
 
 # Model Performance Scores
@@ -436,24 +436,24 @@ class EndpointCore(Artifact):
         # Return the prediction DataFrame
         return prediction_df
 
-    def cross_fold_inference(self, nfolds: int = 5) -> dict:
+    def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
         """Run cross-fold inference (only works for XGBoost models)
 
         Args:
             nfolds (int): Number of folds to use for cross-fold (default: 5)
 
         Returns:
-            dict:
+            Tuple[dict, pd.DataFrame]: Tuple of (cross_fold_metrics, out_of_fold_df)
         """
 
         # Grab our model
         model = ModelCore(self.model_name)
 
         # Compute CrossFold Metrics
-        cross_fold_metrics = cross_fold_inference(model, nfolds=nfolds)
+        cross_fold_metrics, out_of_fold_df = cross_fold_inference(model, nfolds=nfolds)
         if cross_fold_metrics:
             self.param_store.upsert(f"/workbench/models/{model.name}/inference/cross_fold", cross_fold_metrics)
-        return cross_fold_metrics
+        return cross_fold_metrics, out_of_fold_df
 
     def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
         """Run inference on the Endpoint using the provided DataFrame
workbench/core/artifacts/feature_set_core.py
CHANGED

@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.data_source_factory import DataSourceFactory
 from workbench.core.artifacts.athena_source import AthenaSource
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Optional, List, Union
 
 from workbench.utils.aws_utils import aws_throttle
 
@@ -514,7 +514,7 @@ class FeatureSetCore(Artifact):
 
         Args:
             filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
-                If None or empty string, will reset to
+                If None or empty string, will reset to training view with no filter
                 (default: None)
         """
         from workbench.core.views import TrainingView
@@ -528,6 +528,29 @@ class FeatureSetCore(Artifact):
             self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
         )
 
+    def exclude_ids_from_training(self, ids: List[Union[str, int]], column_name: Optional[str] = None):
+        """Exclude a list of IDs from the training view
+
+        Args:
+            ids (List[Union[str, int]],): List of IDs to exclude from training
+            column_name (Optional[str]): Column name to filter on.
+                If None, uses self.id_column (default: None)
+        """
+        # Use the default id_column if not specified
+        column = column_name or self.id_column
+
+        # Handle empty list case
+        if not ids:
+            self.log.warning("No IDs provided to exclude")
+            return
+
+        # Build the filter expression with proper SQL quoting
+        quoted_ids = ", ".join([repr(id) for id in ids])
+        filter_expression = f"{column} NOT IN ({quoted_ids})"
+
+        # Apply the filter
+        self.set_training_filter(filter_expression)
+
     @classmethod
     def delete_views(cls, table: str, database: str):
         """Delete any views associated with this FeatureSet
@@ -769,6 +792,13 @@ if __name__ == "__main__":
     print(f"Training Data: {training_data.shape}")
     print(training_data)
 
+    # Test excluding ids from training
+    print("Excluding ids from training...")
+    my_features.exclude_ids_from_training([1, 2, 3, 4, 5])
+    training_data = my_features.get_training_data()
+    print(f"Training Data: {training_data.shape}")
+    print(training_data)
+
     # Now delete the AWS artifacts associated with this Feature Set
     # print("Deleting Workbench Feature Set...")
     # my_features.delete()
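The new exclude_ids_from_training() is a thin convenience wrapper over set_training_filter(): it quotes each ID with repr() and filters the training view with a NOT IN clause on the id column. A short sketch, assuming a FeatureSet whose id column is named "id" (the FeatureSet name and the string IDs below are made up for illustration):

from workbench.core.artifacts.feature_set_core import FeatureSetCore

my_features = FeatureSetCore("test_features")  # hypothetical FeatureSet name

# Integer IDs: builds and applies the filter  id NOT IN (1, 2, 3, 4, 5)
my_features.exclude_ids_from_training([1, 2, 3, 4, 5])

# String IDs on another column: repr() supplies the quotes,
# e.g.  compound_id NOT IN ('A-001', 'A-002')
my_features.exclude_ids_from_training(["A-001", "A-002"], column_name="compound_id")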
workbench/model_scripts/custom_models/proximity/feature_space_proximity.template
CHANGED

@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }
 
 from io import StringIO
@@ -73,10 +73,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -88,6 +85,7 @@ if __name__ == "__main__":
     # Now serialize the model
     model.serialize(args.model_dir)
 
+
 # Model loading and prediction functions
 def model_fn(model_dir):
 
workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template
CHANGED

@@ -14,7 +14,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 
@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
    df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
     # Create and train the Regression/Confidence model
     # model = BayesianRidge()
     model = BayesianRidge(
-        alpha_1=1e-6,
-
+        alpha_1=1e-6,
+        alpha_2=1e-6,  # Noise precision
+        lambda_1=1e-6,
+        lambda_2=1e-6,  # Weight precision
         fit_intercept=True,
     )
 
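For context, BayesianRidge serves as the regression/confidence model here because it can report a per-sample predictive standard deviation. A standalone sketch on synthetic data (not part of the template) using the same Gamma-prior hyperparameters:

import numpy as np
from sklearn.linear_model import BayesianRidge

# Tiny synthetic regression problem, for illustration only
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

# alpha_1/alpha_2 parameterize the Gamma prior over the noise precision,
# lambda_1/lambda_2 the Gamma prior over the weight precision
model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6, fit_intercept=True)
model.fit(X, y)

# return_std=True yields the predictive standard deviation used for uncertainty
pred, pred_std = model.predict(X[:5], return_std=True)
print(pred, pred_std)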
workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template
CHANGED

@@ -4,11 +4,7 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
 from scipy.optimize import minimize
 
@@ -23,7 +19,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
     "train_all_data": "{{train_all_data}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 
@@ -47,7 +43,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +86,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -172,16 +165,14 @@ if __name__ == "__main__":
     cv_residuals = np.array(cv_residuals)
     cv_uncertainties = np.array(cv_uncertainties)
 
-
     # Optimize calibration parameters: σ_cal = a * σ_uc + b
     def neg_log_likelihood(params):
         a, b = params
         sigma_cal = a * cv_uncertainties + b
         sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
-        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal
+        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))
 
-
-    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
     cal_a, cal_b = result.x
 
     print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
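The objective above is the Gaussian negative log-likelihood of the cross-validation residuals under the calibrated spread sigma_cal = a * sigma_uc + b. A self-contained sketch with synthetic arrays (not package data) showing the fit recovering a known scaling:

import numpy as np
from scipy.optimize import minimize

# Synthetic stand-ins for the template's cross-validation arrays
rng = np.random.default_rng(42)
cv_uncertainties = rng.uniform(0.1, 1.0, size=500)            # raw ensemble spread (sigma_uc)
cv_residuals = rng.normal(0.0, 1.5 * cv_uncertainties + 0.2)  # true errors are wider than sigma_uc

def neg_log_likelihood(params):
    a, b = params
    sigma_cal = np.maximum(a * cv_uncertainties + b, 1e-8)
    # Gaussian NLL: 0.5*log(2*pi*sigma^2) + 0.5*r^2/sigma^2, summed over samples
    return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * cv_residuals**2 / sigma_cal**2)

result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
cal_a, cal_b = result.x
print(f"a ~ {cal_a:.2f}, b ~ {cal_b:.2f}")  # should land near a=1.5, b=0.2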
@@ -205,7 +196,9 @@ if __name__ == "__main__":
     result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)
 
     # Compute uncalibrated uncertainty
-    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+        axis=1
+    )
 
     # Apply calibration to uncertainty
     result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -352,4 +345,4 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df.reindex(sorted(df.columns), axis=1)
 
     # All done, return the DataFrame
-    return df
+    return df
workbench/model_scripts/custom_models/uq_models/gaussian_process.template
CHANGED

@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
@@ -76,10 +76,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
     )
 
     # Create a Pipeline with StandardScaler
-    model = Pipeline([
-        ("scaler", StandardScaler()),
-        ("model", model)
-    ])
+    model = Pipeline([("scaler", StandardScaler()), ("model", model)])
 
     # Prepare features and targets for training
     X_train = df_train[features]
workbench/model_scripts/custom_models/uq_models/generated_model_script.py
CHANGED

@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -22,10 +18,11 @@ from typing import List, Tuple
 
 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "target": "
-    "features": ['
+    "target": "solubility",
+    "features": ['molwt', 'mollogp', 'molmr', 'heavyatomcount', 'numhacceptors', 'numhdonors', 'numheteroatoms', 'numrotatablebonds', 'numvalenceelectrons', 'numaromaticrings', 'numsaturatedrings', 'numaliphaticrings', 'ringcount', 'tpsa', 'labuteasa', 'balabanj', 'bertzct'],
     "compressed_features": [],
-    "train_all_data":
+    "train_all_data": False,
+    "hyperparameters": {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'colsample_bylevel': 0.5, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.5, 'reg_lambda': 2.0, 'scale_pos_weight': 1},
 }
 
 
@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
 
 
 def decompress_features(
-
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features
 
@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -227,7 +219,8 @@ if __name__ == "__main__":
 
     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
     xgb_model.fit(X_train, y_train)
 
     # Evaluate XGBoost performance
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )
 
         # Conformalize the model
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }
 
 
@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=[
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """
 
+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:
 
     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
 
     # Get features for prediction
     X = matched_df[model_features]
@@ -475,7 +466,7 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]
 
-    # Calculate a
+    # Calculate a pseudo-standard deviation from the 68% interval width
    df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0
 
     # Reorder the quantile columns for easier reading
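The pseudo-standard deviation relies on the Gaussian rule of thumb that the 16th-to-84th percentile range spans about two standard deviations, so half that width approximates sigma. A quick illustrative check (scipy, not package code):

from scipy.stats import norm

# For a normal distribution the 16th and 84th percentiles sit roughly 1 sigma from the mean,
# so (q_84 - q_16) / 2 recovers sigma
mu, sigma = 10.0, 2.5
q_16, q_84 = norm.ppf([0.16, 0.84], loc=mu, scale=sigma)
print((q_84 - q_16) / 2)  # ~2.49, close to sigma = 2.5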
@@ -484,7 +475,19 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df[other_cols + quantile_cols]
 
     # Adjust the outer quantiles to ensure they encompass the prediction
-
-
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])
 
     return df