workbench 0.8.170__py3-none-any.whl → 0.8.171__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench might be problematic; see the registry's advisory page for more details.
- workbench/core/cloud_platform/aws/aws_meta.py +11 -4
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +141 -151
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +149 -39
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -2
- workbench/model_scripts/xgb_model/generated_model_script.py +11 -11
- workbench/model_scripts/xgb_model/xgb_model.template +7 -7
- workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +1 -1
- workbench/scripts/ml_pipeline_sqs.py +139 -0
- workbench/utils/model_utils.py +12 -0
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- {workbench-0.8.170.dist-info → workbench-0.8.171.dist-info}/METADATA +1 -1
- {workbench-0.8.170.dist-info → workbench-0.8.171.dist-info}/RECORD +16 -15
- {workbench-0.8.170.dist-info → workbench-0.8.171.dist-info}/entry_points.txt +2 -1
- {workbench-0.8.170.dist-info → workbench-0.8.171.dist-info}/WHEEL +0 -0
- {workbench-0.8.170.dist-info → workbench-0.8.171.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.170.dist-info → workbench-0.8.171.dist-info}/top_level.txt +0 -0
workbench/core/cloud_platform/aws/aws_meta.py

@@ -196,7 +196,9 @@ class AWSMeta:

         # Return the summary as a DataFrame
         df = pd.DataFrame(data_summary).convert_dtypes()
-        …
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df

     def models(self, details: bool = False) -> pd.DataFrame:
         """Get a summary of the Models in AWS.

@@ -256,7 +258,9 @@ class AWSMeta:

         # Return the summary as a DataFrame
         df = pd.DataFrame(model_summary).convert_dtypes()
-        …
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df

     def endpoints(self, details: bool = False) -> pd.DataFrame:
         """Get a summary of the Endpoints in AWS.

@@ -317,7 +321,9 @@ class AWSMeta:

         # Return the summary as a DataFrame
         df = pd.DataFrame(data_summary).convert_dtypes()
-        …
+        if not df.empty:
+            df.sort_values(by="Created", ascending=False, inplace=True)
+        return df

     def _endpoint_config_info(self, endpoint_config_name: str) -> dict:
         """Internal: Get the Endpoint Configuration information for the given endpoint config name.

@@ -657,7 +663,8 @@ class AWSMeta:
         df = pd.DataFrame(data_summary).convert_dtypes()

         # Sort by the Modified column
-        …
+        if not df.empty:
+            df = df.sort_values(by="Modified", ascending=False)
         return df

     def _aws_pipelines(self) -> pd.DataFrame:
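For context, a minimal sketch (toy data, not workbench code) of the failure mode the new `if not df.empty:` guards avoid: an empty summary produces a DataFrame with no "Created"/"Modified" column, so an unconditional sort would raise a KeyError.

```python
import pandas as pd

df = pd.DataFrame([])  # empty summary: no rows and no "Created" column
if not df.empty:
    df.sort_values(by="Created", ascending=False, inplace=True)  # skipped when empty
print(df)  # the empty DataFrame is returned untouched instead of raising
```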
workbench/model_scripts/custom_models/uq_models/generated_model_script.py

@@ -2,7 +2,6 @@
 from ngboost import NGBRegressor
 from xgboost import XGBRegressor  # Base Estimator
 from sklearn.model_selection import train_test_split
-import numpy as np

 # Model Performance Scores
 from sklearn.metrics import (

@@ -16,7 +15,9 @@ import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
+from typing import List, Tuple

 # Local Imports
 from proximity import Proximity

@@ -25,11 +26,12 @@ from proximity import Proximity

 # Template Placeholders
 TEMPLATE_PARAMS = {
-    "id_column": "…
-    "…
-    "…
-    "…
-    "…
+    "id_column": "udm_mol_bat_id",
+    "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
"features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
+    "compressed_features": [],
+    "train_all_data": False,
+    "track_columns": ['udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein']
 }


@@ -73,136 +75,97 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)


-def …
-    df_pred: pd.DataFrame,
-    prox_df: pd.DataFrame,
-    calibration_strength: float = 0.7,
-    distance_decay: float = 3.0,
-) -> pd.DataFrame:
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
     """
-    …
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+                                            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
     """
-    …
-    # Calculate distance weights (closer neighbors get more weight)
-    prox_df = prox_df.copy()
-    prox_df['weight'] = 1 / (1 + prox_df['distance'] ** distance_decay)
-
-    # Get weighted quantiles and statistics for each ID
-    neighbor_stats = []
-    for id_val, group in prox_df.groupby(id_column):
-        values = group[target_column].values
-        weights = group['weight'].values
-
-        # Normalize weights
-        weights = weights / weights.sum()
-
-        stats = {
-            id_column: id_val,
-            'local_q025': weighted_quantile(values, weights, 0.025),
-            'local_q25': weighted_quantile(values, weights, 0.25),
-            'local_q75': weighted_quantile(values, weights, 0.75),
-            'local_q975': weighted_quantile(values, weights, 0.975),
-            'local_median': weighted_quantile(values, weights, 0.5),
-            'local_std': np.sqrt(np.average((values - np.average(values, weights=weights)) ** 2, weights=weights)),
-            'avg_distance': group['distance'].mean(),
-            'min_distance': group['distance'].min(),
-            'max_distance': group['distance'].max(),
-        }
-        neighbor_stats.append(stats)
-
-    neighbor_df = pd.DataFrame(neighbor_stats)
-    out = df_pred.merge(neighbor_df, on=id_column, how='left')
-
-    # Model disagreement score (normalized by prediction std)
-    model_disagreement = (out["prediction"] - out["prediction_uq"]).abs()
-    disagreement_score = (model_disagreement / out["prediction_std"]).clip(0, 2)
-
-    # Local confidence based on:
-    # 1. How close the neighbors are (closer = more confident)
-    # 2. How much local variance there is (less variance = more confident)
-    max_reasonable_distance = out['max_distance'].quantile(0.8)  # 80th percentile as reference
-    distance_confidence = (1 - (out['avg_distance'] / max_reasonable_distance)).clip(0.1, 1.0)
-
-    variance_confidence = (out["prediction_std"] / out["local_std"]).clip(0.5, 2.0)
-    local_confidence = distance_confidence * variance_confidence.clip(0.5, 1.5)
-
-    # Calibration weight: higher when models disagree and we have good local data
-    calibration_weight = (
-        calibration_strength *
-        local_confidence *  # Weight by local data quality
-        disagreement_score.clip(0.3, 1.0)  # More calibration when models disagree
-    )
-
-    consensus_pred = 0.65 * out["prediction_uq"] + 0.35 * out["prediction"]
-
-    # Re-center local intervals around consensus prediction
-    local_center_offset = consensus_pred - out["local_median"]
-
-    sparse_region_mask = out['min_distance'] > out['min_distance'].quantile(0.9)
-    expansion_factor = 1 + 0.2 * sparse_region_mask  # 20% expansion in sparse regions
-    …
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the XGBoost model
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features


-# TRAINING SECTION
-#
-# This section (__main__) is where SageMaker will execute the training job
-# and save the model artifacts to the model directory.
-#
 if __name__ == "__main__":
     # Template Parameters
     id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2

@@ -216,34 +179,51 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

-    # …
+    # Read the training data into DataFrames
     training_files = [
         os.path.join(args.train, file)
-        for file in os.listdir(args.train)
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
     ]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
-    …
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

-    # Check if the …
-    check_dataframe(…
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")

-    # …
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")
+
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)
+
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-        …
-        print("…
-        df_train = …
-        df_val = …
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        # …
-        print("…
-        df_train, df_val = train_test_split(…
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")

     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
     xgb_model = XGBRegressor()

@@ -251,18 +231,16 @@ if __name__ == "__main__":

     # Prepare features and targets for training
     X_train = df_train[features]
-    …
+    X_validate = df_val[features]
     y_train = df_train[target]
-    …
+    y_validate = df_val[target]

     # Train both models using the training data
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=…
+    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
     preds = xgb_model.predict(X_validate)

     # Calculate various model performance metrics (regression)

@@ -280,9 +258,9 @@ if __name__ == "__main__":
     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-    # Save the …
+    # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(…
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

     # Now the Proximity model
     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)

@@ -295,7 +273,7 @@ if __name__ == "__main__":
 # Inference Section
 #
 def model_fn(model_dir) -> dict:
-    """Load and return XGBoost and …
+    """Load and return XGBoost, NGBoost, and Prox Model from model directory."""

     # Load XGBoost regressor
     xgb_path = os.path.join(model_dir, "xgb_model.json")

@@ -376,18 +354,30 @@ def predict_fn(df, models) -> pd.DataFrame:
     df["prediction_std"] = dist_params['scale']  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
-    …
+    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+    # so we need to adjust the bounds to include the point prediction
+    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)
-    df["q_75"] = y_dists.ppf(0.75)
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-    # …
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

-    # …
+    # Compute Nearest neighbors with Proximity model
+    models["proximity"].neighbors(df)

     # Return the modified DataFrame
     return df
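A small, self-contained sketch (toy column names, not workbench code) of the bitstring decompression that the new `decompress_features()` performs: each compressed fingerprint string becomes one uint8 column per bit, named with the feature's three-letter prefix.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"fingerprint": ["1010", "0110"]})
bit_matrix = np.array([list(bs) for bs in df["fingerprint"]], dtype=np.uint8)
new_cols = [f"fin_{i}" for i in range(bit_matrix.shape[1])]  # prefix = feature[:3]
df = pd.concat(
    [df.drop(columns=["fingerprint"]), pd.DataFrame(bit_matrix, columns=new_cols, index=df.index)],
    axis=1,
)
print(df)  # columns fin_0..fin_3 holding 0/1 values
```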
workbench/model_scripts/custom_models/uq_models/meta_uq.template

@@ -15,7 +15,9 @@ import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
+from typing import List, Tuple

 # Local Imports
 from proximity import Proximity

@@ -25,8 +27,9 @@ from proximity import Proximity
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
-    "features": "{{feature_list}}",
     "target": "{{target_column}}",
+    "features": "{{feature_list}}",
+    "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
     "track_columns": "{{track_columns}}"
 }

@@ -72,16 +75,97 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)


-…
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+    """
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+                                            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(df: pd.DataFrame, features: List[str], compressed_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the XGBoost model
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
+
+
 if __name__ == "__main__":
     # Template Parameters
     id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2

@@ -95,34 +179,51 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

-    # …
+    # Read the training data into DataFrames
     training_files = [
         os.path.join(args.train, file)
-        for file in os.listdir(args.train)
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
     ]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
-    …
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")

-    # …
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)

-    # …
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-        …
-        print("…
-        df_train = …
-        df_val = …
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        # …
-        print("…
-        df_train, df_val = train_test_split(…
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")

     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
     xgb_model = XGBRegressor()

@@ -130,18 +231,16 @@ if __name__ == "__main__":

     # Prepare features and targets for training
     X_train = df_train[features]
-    …
+    X_validate = df_val[features]
     y_train = df_train[target]
-    …
+    y_validate = df_val[target]

     # Train both models using the training data
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=…
+    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
     preds = xgb_model.predict(X_validate)

     # Calculate various model performance metrics (regression)

@@ -159,9 +258,9 @@ if __name__ == "__main__":
     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-    # Save the …
+    # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(…
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

     # Now the Proximity model
     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)

@@ -255,16 +354,27 @@ def predict_fn(df, models) -> pd.DataFrame:
     df["prediction_std"] = dist_params['scale']  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
-    …
+    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+    # so we need to adjust the bounds to include the point prediction
+    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)
-    df["q_75"] = y_dists.ppf(0.75)
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-    # …
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Compute Nearest neighbors with Proximity model
     models["proximity"].neighbors(df)
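A brief sketch (toy data, hypothetical column name) of the training/inference round trip that `convert_categorical_types()` implements: training fixes the category set, and inference re-applies the stored mapping so unseen values fall out as NaN.

```python
import pandas as pd

# "Training mode": derive and store the category mapping
train = pd.DataFrame({"solvent": ["dmso", "water", "dmso"]})
train["solvent"] = train["solvent"].astype("category")
mappings = {"solvent": train["solvent"].cat.categories.tolist()}

# "Inference mode": apply the stored mapping to new data
infer = pd.DataFrame({"solvent": ["water", "acetone"]})
infer["solvent"] = pd.Categorical(infer["solvent"], categories=mappings["solvent"])
print(infer["solvent"])  # "acetone" was never seen in training, so it becomes NaN
```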
workbench/model_scripts/custom_models/uq_models/ngboost.template

@@ -219,9 +219,22 @@ def predict_fn(df, model) -> pd.DataFrame:
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)
-    df["q_75"] = y_dists.ppf(0.75)
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Return the modified DataFrame
     return df
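For readers unfamiliar with where `y_dists.ppf(...)` comes from, here is a minimal, self-contained sketch (synthetic data; not the template's exact code) of how NGBoost's predictive distributions yield the new interval columns.

```python
import numpy as np
from ngboost import NGBRegressor

# Tiny synthetic regression problem
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

ngb = NGBRegressor(verbose=False).fit(X, y)
y_dists = ngb.pred_dist(X[:5])                     # per-row predictive distributions
q_05, q_95 = y_dists.ppf(0.05), y_dists.ppf(0.95)  # bounds of a 90% prediction interval
print(np.column_stack([q_05, q_95]))
```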
workbench/model_scripts/xgb_model/generated_model_script.py

@@ -28,12 +28,12 @@ from typing import List, Tuple

 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "…
-    "…
-
"features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'
+    "model_type": "regressor",
+    "target": "udm_asy_res_intrinsic_clearance_ul_per_min_per_mg_protein",
+
"features": ['bcut2d_logplow', 'numradicalelectrons', 'smr_vsa5', 'fr_lactam', 'fr_morpholine', 'fr_aldehyde', 'slogp_vsa1', 'fr_amidine', 'bpol', 'fr_ester', 'fr_azo', 'kappa3', 'peoe_vsa5', 'fr_ketone_topliss', 'vsa_estate9', 'estate_vsa9', 'bcut2d_mrhi', 'fr_ndealkylation1', 'numrotatablebonds', 'minestateindex', 'fr_quatn', 'peoe_vsa3', 'fr_epoxide', 'fr_aniline', 'minpartialcharge', 'fr_nitroso', 'fpdensitymorgan2', 'fr_oxime', 'fr_sulfone', 'smr_vsa1', 'kappa1', 'fr_pyridine', 'numaromaticrings', 'vsa_estate6', 'molmr', 'estate_vsa1', 'fr_dihydropyridine', 'vsa_estate10', 'fr_alkyl_halide', 'chi2n', 'fr_thiocyan', 'fpdensitymorgan1', 'fr_unbrch_alkane', 'slogp_vsa9', 'chi4n', 'fr_nitro_arom', 'fr_al_oh', 'fr_furan', 'fr_c_s', 'peoe_vsa8', 'peoe_vsa14', 'numheteroatoms', 'fr_ndealkylation2', 'maxabspartialcharge', 'vsa_estate2', 'peoe_vsa7', 'apol', 'numhacceptors', 'fr_tetrazole', 'vsa_estate1', 'peoe_vsa9', 'naromatom', 'bcut2d_chghi', 'fr_sh', 'fr_halogen', 'slogp_vsa4', 'fr_benzodiazepine', 'molwt', 'fr_isocyan', 'fr_prisulfonamd', 'maxabsestateindex', 'minabsestateindex', 'peoe_vsa11', 'slogp_vsa12', 'estate_vsa5', 'numaliphaticcarbocycles', 'bcut2d_mwlow', 'slogp_vsa7', 'fr_allylic_oxid', 'fr_methoxy', 'fr_nh0', 'fr_coo2', 'fr_phenol', 'nacid', 'nbase', 'chi3v', 'fr_ar_nh', 'fr_nitrile', 'fr_imidazole', 'fr_urea', 'bcut2d_mrlow', 'chi1', 'smr_vsa6', 'fr_aryl_methyl', 'narombond', 'fr_alkyl_carbamate', 'fr_piperzine', 'exactmolwt', 'qed', 'chi0n', 'fr_sulfonamd', 'fr_thiazole', 'numvalenceelectrons', 'fr_phos_acid', 'peoe_vsa12', 'fr_nh1', 'fr_hdrzine', 'fr_c_o_nocoo', 'fr_lactone', 'estate_vsa6', 'bcut2d_logphi', 'vsa_estate7', 'peoe_vsa13', 'numsaturatedcarbocycles', 'fr_nitro', 'fr_phenol_noorthohbond', 'rotratio', 'fr_barbitur', 'fr_isothiocyan', 'balabanj', 'fr_arn', 'fr_imine', 'maxpartialcharge', 'fr_sulfide', 'slogp_vsa11', 'fr_hoccn', 'fr_n_o', 'peoe_vsa1', 'slogp_vsa6', 'heavyatommolwt', 'fractioncsp3', 'estate_vsa8', 'peoe_vsa10', 'numaliphaticrings', 'fr_thiophene', 'maxestateindex', 'smr_vsa10', 'labuteasa', 'smr_vsa2', 'fpdensitymorgan3', 'smr_vsa9', 'slogp_vsa10', 'numaromaticheterocycles', 'fr_nh2', 'fr_diazo', 'chi3n', 'fr_ar_coo', 'slogp_vsa5', 'fr_bicyclic', 'fr_amide', 'estate_vsa10', 'fr_guanido', 'chi1n', 'numsaturatedrings', 'fr_piperdine', 'fr_term_acetylene', 'estate_vsa4', 'slogp_vsa3', 'fr_coo', 'fr_ether', 'estate_vsa7', 'bcut2d_chglo', 'fr_oxazole', 'peoe_vsa6', 'hallkieralpha', 'peoe_vsa2', 'chi2v', 'nocount', 'vsa_estate5', 'fr_nhpyrrole', 'fr_al_coo', 'bertzct', 'estate_vsa11', 'minabspartialcharge', 'slogp_vsa8', 'fr_imide', 'kappa2', 'numaliphaticheterocycles', 'numsaturatedheterocycles', 'fr_hdrzone', 'smr_vsa4', 'fr_ar_n', 'nrot', 'smr_vsa8', 'slogp_vsa2', 'chi4v', 'fr_phos_ester', 'fr_para_hydroxylation', 'smr_vsa3', 'nhohcount', 'estate_vsa2', 'mollogp', 'tpsa', 'fr_azide', 'peoe_vsa4', 'numhdonors', 'fr_al_oh_notert', 'fr_c_o', 'chi0', 'fr_nitro_arom_nonortho', 'vsa_estate3', 'fr_benzene', 'fr_ketone', 'vsa_estate8', 'smr_vsa7', 'fr_ar_oh', 'fr_priamide', 'ringcount', 'estate_vsa3', 'numaromaticcarbocycles', 'bcut2d_mwhi', 'chi1v', 'heavyatomcount', 'vsa_estate4', 'chi0v'],
     "compressed_features": [],
-    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/…
-    "train_all_data": …
+    "model_metrics_s3_path": "s3://ideaya-sageworks-bucket/models/temp-hlm-phase1-reg-0-80/training",
+    "train_all_data": False
 }

 # Function to check if dataframe is empty

@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match

@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
-
+
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)


@@ -197,7 +197,7 @@ if __name__ == "__main__":
     """The main function is for training the XGBoost model"""

     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["…
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]

@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
workbench/model_scripts/xgb_model/xgb_model.template

@@ -29,7 +29,7 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "…
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",

@@ -88,13 +88,12 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match

@@ -102,10 +101,11 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
-
+
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)


@@ -197,7 +197,7 @@ if __name__ == "__main__":
     """The main function is for training the XGBoost model"""

     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["…
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]

@@ -390,7 +390,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
workbench/scripts/ml_pipeline_batch.py (renamed from ml_pipeline_launcher.py)

@@ -76,7 +76,7 @@ def run_batch_job(script_path: str, size: str = "small") -> int:
     response = batch.submit_job(
         jobName=job_name,
         jobQueue="workbench-job-queue",
-        jobDefinition=f"workbench-…
+        jobDefinition=f"workbench-batch-{size}",
         containerOverrides={
             "environment": [
                 {"name": "ML_PIPELINE_S3_PATH", "value": s3_path},
workbench/scripts/ml_pipeline_sqs.py (new file)

@@ -0,0 +1,139 @@
+import argparse
+import logging
+import json
+from pathlib import Path
+
+# Workbench Imports
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+from workbench.utils.config_manager import ConfigManager
+from workbench.utils.s3_utils import upload_content_to_s3
+
+log = logging.getLogger("workbench")
+cm = ConfigManager()
+workbench_bucket = cm.get_config("WORKBENCH_BUCKET")
+
+
+def submit_to_sqs(script_path: str, size: str = "small") -> None:
+    """
+    Upload script to S3 and submit message to SQS queue for processing.
+    Args:
+        script_path: Local path to the ML pipeline script
+        size: Job size tier - "small" (default), "medium", or "large"
+    """
+    print(f"\n{'=' * 60}")
+    print("🚀 SUBMITTING ML PIPELINE JOB")
+    print(f"{'=' * 60}")
+
+    if size not in ["small", "medium", "large"]:
+        raise ValueError(f"Invalid size '{size}'. Must be 'small', 'medium', or 'large'")
+    # Validate script exists
+    script_file = Path(script_path)
+    if not script_file.exists():
+        raise FileNotFoundError(f"Script not found: {script_path}")
+
+    print(f"📄 Script: {script_file.name}")
+    print(f"📏 Size tier: {size}")
+    print(f"🪣 Bucket: {workbench_bucket}")
+    sqs = AWSAccountClamp().boto3_session.client("sqs")
+    script_name = script_file.name
+
+    # List Workbench queues
+    print("\n📋 Listing Workbench SQS queues...")
+    try:
+        queues = sqs.list_queues(QueueNamePrefix="workbench-")
+        queue_urls = queues.get("QueueUrls", [])
+        if queue_urls:
+            print(f"✅ Found {len(queue_urls)} workbench queue(s):")
+            for url in queue_urls:
+                queue_name = url.split("/")[-1]
+                print(f"   • {queue_name}")
+        else:
+            print("⚠️ No workbench queues found")
+    except Exception as e:
+        print(f"❌ Error listing queues: {e}")
+
+    # Upload script to S3
+    s3_path = f"s3://{workbench_bucket}/batch-jobs/{script_name}"
+    print("\n📤 Uploading script to S3...")
+    print(f"   Source: {script_path}")
+    print(f"   Destination: {s3_path}")
+
+    try:
+        upload_content_to_s3(script_file.read_text(), s3_path)
+        print("✅ Script uploaded successfully")
+    except Exception as e:
+        print(f"❌ Upload failed: {e}")
+        raise
+    # Get queue URL and info
+    queue_name = "workbench-ml-pipeline-queue.fifo"
+    print("\n🎯 Getting queue information...")
+    print(f"   Queue name: {queue_name}")
+
+    try:
+        queue_url = sqs.get_queue_url(QueueName=queue_name)["QueueUrl"]
+        print(f"   Queue URL: {queue_url}")
+
+        # Get queue attributes for additional info
+        attrs = sqs.get_queue_attributes(
+            QueueUrl=queue_url, AttributeNames=["ApproximateNumberOfMessages", "ApproximateNumberOfMessagesNotVisible"]
+        )
+        messages_available = attrs["Attributes"].get("ApproximateNumberOfMessages", "0")
+        messages_in_flight = attrs["Attributes"].get("ApproximateNumberOfMessagesNotVisible", "0")
+        print(f"   Messages in queue: {messages_available}")
+        print(f"   Messages in flight: {messages_in_flight}")
+
+    except Exception as e:
+        print(f"❌ Error accessing queue: {e}")
+        raise
+
+    # Prepare message
+    message = {"script_path": s3_path, "size": size}
+    print("\n📨 Sending message to SQS...")
+
+    # Send the message to SQS
+    try:
+        response = sqs.send_message(
+            QueueUrl=queue_url,
+            MessageBody=json.dumps(message, indent=2),
+            MessageGroupId="ml-pipeline-jobs",  # Required for FIFO
+        )
+        message_id = response["MessageId"]
+        print("✅ Message sent successfully!")
+        print(f"   Message ID: {message_id}")
+    except Exception as e:
+        print(f"❌ Failed to send message: {e}")
+        raise
+
+    # Success summary
+    print(f"\n{'=' * 60}")
+    print("✅ JOB SUBMISSION COMPLETE")
+    print(f"{'=' * 60}")
+    print(f"📄 Script: {script_name}")
+    print(f"📏 Size: {size}")
+    print(f"🆔 Message ID: {message_id}")
+    print("\n🔍 MONITORING LOCATIONS:")
+    print(f"   • SQS Queue: AWS Console → SQS → {queue_name}")
+    print("   • Lambda Logs: AWS Console → Lambda → Functions")
+    print("   • Batch Jobs: AWS Console → Batch → Jobs")
+    print("   • CloudWatch: AWS Console → CloudWatch → Log groups")
+    print("\n⏳ Your job should start processing soon...")
+
+
+def main():
+    """CLI entry point for submitting ML pipelines via SQS."""
+    parser = argparse.ArgumentParser(description="Submit ML pipeline to SQS queue for Batch processing")
+    parser.add_argument("script_file", help="Local path to ML pipeline script")
+    parser.add_argument(
+        "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
+    )
+    args = parser.parse_args()
+    try:
+        submit_to_sqs(args.script_file, args.size)
+    except Exception as e:
+        print(f"\n❌ ERROR: {e}")
+        log.error(f"Error: {e}")
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
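A hypothetical usage sketch for the new module (function and module path are taken from the diff above; the pipeline script name is made up): it uploads the script to the workbench S3 bucket and enqueues a job message on the FIFO queue. The entry_points.txt change (+2 -1) suggests a console-script wrapper around main() as well, though its name isn't visible in this diff.

```python
from workbench.scripts.ml_pipeline_sqs import submit_to_sqs

# Uploads my_pipeline.py to s3://<WORKBENCH_BUCKET>/batch-jobs/ and sends a
# {"script_path": ..., "size": ...} message to workbench-ml-pipeline-queue.fifo
submit_to_sqs("my_pipeline.py", size="medium")
```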
workbench/utils/model_utils.py
@@ -220,6 +220,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     # --- Coverage and Interval Width ---
     if "q_025" in df.columns and "q_975" in df.columns:
         lower_95, upper_95 = df["q_025"], df["q_975"]
+        lower_90, upper_90 = df["q_05"], df["q_95"]
+        lower_80, upper_80 = df["q_10"], df["q_90"]
         lower_50, upper_50 = df["q_25"], df["q_75"]
     elif "prediction_std" in df.columns:
         lower_95 = df["prediction"] - 1.96 * df["prediction_std"]

@@ -231,8 +233,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
             "Either quantile columns (q_025, q_975, q_25, q_75) or 'prediction_std' column must be present."
         )
     coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
+    coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
+    coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
     coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
     avg_width_95 = np.mean(upper_95 - lower_95)
+    avg_width_90 = np.mean(upper_90 - lower_90)
+    avg_width_80 = np.mean(upper_80 - lower_80)
     avg_width_50 = np.mean(upper_50 - lower_50)

     # --- CRPS (measures calibration + sharpness) ---

@@ -260,6 +266,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
     # Collect results
     results = {
         "coverage_95": coverage_95,
+        "coverage_90": coverage_90,
+        "coverage_80": coverage_80,
         "coverage_50": coverage_50,
         "avg_width_95": avg_width_95,
         "avg_width_50": avg_width_50,

@@ -271,8 +279,12 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

     print("\n=== UQ Metrics ===")
     print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
+    print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
+    print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
     print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
     print(f"Average 95% Width: {avg_width_95:.3f}")
+    print(f"Average 90% Width: {avg_width_90:.3f}")
+    print(f"Average 80% Width: {avg_width_80:.3f}")
     print(f"Average 50% Width: {avg_width_50:.3f}")
     print(f"CRPS: {mean_crps:.3f} (lower is better)")
     print(f"Interval Score 95%: {mean_is_95:.3f} (lower is better)")
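A tiny worked example (made-up numbers, not workbench data) of what the new 80% and 90% coverage metrics measure: the fraction of observed targets that land inside the corresponding quantile interval.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "target": [1.0, 2.0, 3.0, 4.0],
    "q_10": [0.5, 1.9, 3.2, 3.0], "q_90": [1.5, 2.4, 3.9, 4.5],  # 80% interval
    "q_05": [0.2, 1.7, 2.9, 2.8], "q_95": [1.8, 2.6, 4.1, 4.8],  # 90% interval
})
coverage_80 = np.mean((df["target"] >= df["q_10"]) & (df["target"] <= df["q_90"]))
coverage_90 = np.mean((df["target"] >= df["q_05"]) & (df["target"] <= df["q_95"]))
print(coverage_80, coverage_90)  # 0.75 1.0 -> well-calibrated models track the nominal levels
```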
workbench/web_interface/components/plugins/dashboard_status.py

@@ -72,7 +72,9 @@ class DashboardStatus(PluginInterface):
             details = "**Redis:** 🔴 Failed to Connect<br>"

         # Fill in the license details
-        …
+        redis_host = config_info.get("REDIS_HOST", "NOT SET")
+        redis_port = config_info.get("REDIS_PORT", "NOT SET")
+        details += f"**Redis Server:** {redis_host}:{redis_port}<br>"
         details += f"**Workbench S3 Bucket:** {config_info['WORKBENCH_BUCKET']}<br>"
         details += f"**Plugin Path:** {config_info.get('WORKBENCH_PLUGINS', 'unknown')}<br>"
         details += f"**Themes Path:** {config_info.get('WORKBENCH_THEMES', 'unknown')}<br>"
workbench-0.8.171.dist-info/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.170
+Version: 0.8.171
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT
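One way to confirm which release is actually installed is to read this METADATA field back through the standard library (importlib.metadata has been available since Python 3.8):

# Reads the Version field of the installed "workbench" distribution's METADATA.
from importlib.metadata import version

print(version("workbench"))  # expected: 0.8.171 after upgrading to this release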
workbench-0.8.171.dist-info/RECORD
CHANGED

@@ -62,7 +62,7 @@ workbench/core/cloud_platform/aws/README.md,sha256=QT5IQXoUHbIA0qQ2wO6_2P2lYjYQF
 workbench/core/cloud_platform/aws/aws_account_clamp.py,sha256=OzFknZXKW7VTvnDGGX4BXKoh0i1gQ7yaEBhkLCyHFSs,6310
 workbench/core/cloud_platform/aws/aws_df_store.py,sha256=utRIlTCPwFneHHZ8_Z3Hw3rOJSeryiFA4wBtucxULRQ,15055
 workbench/core/cloud_platform/aws/aws_graph_store.py,sha256=ytYxQTplUmeWbsPmxyZbf6mO9qyTl60ewlJG8MyfyEY,9414
-workbench/core/cloud_platform/aws/aws_meta.py,sha256=
+workbench/core/cloud_platform/aws/aws_meta.py,sha256=eY9Pn6pl2yAyseACFb2nitR-0vLwG4i8CSEXe8Iaswc,34778
 workbench/core/cloud_platform/aws/aws_parameter_store.py,sha256=9ekuMOQFHFMIEV68UbHhS_fLB9iqG5Hvu4EV6iamEpk,10400
 workbench/core/cloud_platform/aws/aws_secrets_manager.py,sha256=TUnddp1gX-OwxJ_oO5ONh7OI4Z2HC_6euGkJ-himCCk,8615
 workbench/core/cloud_platform/aws/aws_session.py,sha256=2Gc_k4Q87BBeQDgXgVR-w-qmsF6ncZR8wvTeNnixM6k,6926
@@ -139,10 +139,10 @@ workbench/model_scripts/custom_models/uq_models/Readme.md,sha256=UVpL-lvtTrLqwBe
 workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=U4LIlpp8Rbu3apyzPR7-55lvlutpTsCro_PUvQ5pklY,6457
 workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=0IJnSBACQ556ldEiPqR7yPCOOLJs1hQhHmPBvB2d9tY,13491
 workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256=QbDUfkiPCwJ-c-4Twgu4utZuYZaAyeW_3T1IP-_tutw,6683
-workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=
+workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=QsMivNf77m4XfrV9aYTB7K3vI-InwegD7gyLZFNQmF4,17170
 workbench/model_scripts/custom_models/uq_models/mapie_xgb.template,sha256=ZTmerwkmXtewJwx3GGJSdLRyzJV5SJ86PvCu3dV_GHw,7330
-workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=
-workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=
+workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=FqLLbuKMijd4DjmxuBBQN3_vZcbl8WF0BZU8HRK48_0,13977
+workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=9-O6P-SW50ul5Wl6es2DMWXSbrwOg7HWsdc8Qdln0MM,8278
 workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
 workbench/model_scripts/custom_models/uq_models/requirements.txt,sha256=jfwV5b1t6BFtdaRGrSz8LnuQzJm-4V5OlhhP-4CGxhs,107
 workbench/model_scripts/custom_script_example/custom_model_script.py,sha256=T8aydawgRVAdSlDimoWpXxG2YuWWQkbcjBVjAeSG2_0,6408
@@ -158,16 +158,17 @@ workbench/model_scripts/quant_regression/requirements.txt,sha256=jWlGc7HH7vqyukT
 workbench/model_scripts/scikit_learn/generated_model_script.py,sha256=c73ZpJBlU5k13Nx-ZDkLXu7da40CYyhwjwwmuPq6uLg,12870
 workbench/model_scripts/scikit_learn/requirements.txt,sha256=aVvwiJ3LgBUhM_PyFlb2gHXu_kpGPho3ANBzlOkfcvs,107
 workbench/model_scripts/scikit_learn/scikit_learn.template,sha256=d4pgeZYFezUQsB-7iIsjsUgB1FM6d27651wpfDdXmI0,12640
-workbench/model_scripts/xgb_model/generated_model_script.py,sha256=
+workbench/model_scripts/xgb_model/generated_model_script.py,sha256=nU9BLU0wIhK066HAgChgNLcuOM94vBqweoH8xB8wBeo,21152
 workbench/model_scripts/xgb_model/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
-workbench/model_scripts/xgb_model/xgb_model.template,sha256=
+workbench/model_scripts/xgb_model/xgb_model.template,sha256=HViJRsMWn393hP8VJRS45UQBzUVBhwR5sKc8Ern-9f4,17963
 workbench/repl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 workbench/repl/workbench_shell.py,sha256=eJ3rpYgEwZjhrVVCaJHht2N5BrimN6mbxqHXGrJmwC8,22130
 workbench/resources/open_source_api.key,sha256=3S0OTblsmC0msUPdE_dbBmI83xJNmYscuwLJ57JmuOc,433
 workbench/resources/signature_verify_pub.pem,sha256=V3-u-3_z2PH-805ybkKvzDOBwAbvHxcKn0jLBImEtzM,272
 workbench/scripts/check_double_bond_stereo.py,sha256=p5hnL54Weq77ES0HCELq9JeoM-PyUGkvVSeWYF2dKyo,7776
 workbench/scripts/glue_launcher.py,sha256=bIKQvfGxpAhzbeNvTnHfRW_5kQhY-169_868ZnCejJk,10692
-workbench/scripts/
+workbench/scripts/ml_pipeline_batch.py,sha256=1T5JnLlUJR7bwAGBLHmLPOuj1xFRqVIQX8PsuDhHy8o,4907
+workbench/scripts/ml_pipeline_sqs.py,sha256=7w67UUuZNYnxXiZG48gpoEFbH-c_cUfjMg0FgWI0DbQ,5100
 workbench/scripts/monitor_cloud_watch.py,sha256=s7MY4bsHts0nup9G0lWESCvgJZ9Mw1Eo-c8aKRgLjMw,9235
 workbench/scripts/redis_expire.py,sha256=DxI_RKSNlrW2BsJZXcsSbaWGBgPZdPhtzHjV9SUtElE,1120
 workbench/scripts/redis_report.py,sha256=iaJSuGPyLCs6e0TMcZDoT0YyJ43xJ1u74YD8FLnnUg4,990
@@ -219,7 +220,7 @@ workbench/utils/lambda_utils.py,sha256=7GhGRPyXn9o-toWb9HBGSnI8-DhK9YRkwhCSk_mNK
 workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYwoo7ho,6975
 workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
 workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
-workbench/utils/model_utils.py,sha256=
+workbench/utils/model_utils.py,sha256=S_fGnYucuOH5YfNviH-K85hUjSh1zFRCIjuduax7rvU,11940
 workbench/utils/monitor_utils.py,sha256=ywoEdqoHY9t5PYRstjitS_halEWO6veCL_06BekmMVo,9153
 workbench/utils/pandas_utils.py,sha256=LQTfZ3WJkg3rIahNJhsz1YV2y_0DBG94lO-KMmEY1g0,39325
 workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
@@ -256,7 +257,7 @@ workbench/web_interface/components/experiments/dashboard_metric_plots.py,sha256=
 workbench/web_interface/components/experiments/outlier_plot.py,sha256=5bWsmJEXyt50npeQxLHXCPtiq4WRVgg938Sl0DVjNWg,3647
 workbench/web_interface/components/plugins/ag_table.py,sha256=HrPOMotlOGigk0v8Cxx_doSHXdOKTT1-bzlsqDwwzng,3979
 workbench/web_interface/components/plugins/confusion_matrix.py,sha256=1K94JSlDwQwdf5uDYVydQzY-EQm89hYXchxbXoNvons,7176
-workbench/web_interface/components/plugins/dashboard_status.py,sha256=
+workbench/web_interface/components/plugins/dashboard_status.py,sha256=4plmoiXj3dDjoQerUNpep_jfk50pI9rHvcoSP20UbE8,5832
 workbench/web_interface/components/plugins/data_details.py,sha256=pZm1AbM_0EXQwx77qUkfyrU9MedAs4Wlkp6iOtSrUtI,11104
 workbench/web_interface/components/plugins/endpoint_details.py,sha256=0A7g_Lx5-3XnDWOGT3YEDPNpmME_-WfYc65f-rRVjJE,3769
 workbench/web_interface/components/plugins/generated_compounds.py,sha256=hC0sh-1_rbN55Huno-E_2wF37kgIHi5Mtaer6Xk5fRM,8052
@@ -276,9 +277,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
 workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
 workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
 workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
-workbench-0.8.
+workbench-0.8.171.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+workbench-0.8.171.dist-info/METADATA,sha256=cLYIPKqidwQU6U3CIprMiMImJm8hwvKBAJBXGck_Aqo,9210
+workbench-0.8.171.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+workbench-0.8.171.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+workbench-0.8.171.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+workbench-0.8.171.dist-info/RECORD,,
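Each RECORD line follows the wheel convention path,sha256=<digest>,size, where the digest is the urlsafe base64 SHA-256 of the file contents with the trailing '=' padding stripped. A short sketch of recomputing one entry for a locally checked-out file (the path below is illustrative):

# Recompute a RECORD-style entry: urlsafe-base64 SHA-256 digest (padding
# stripped) plus the file size in bytes.
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


# Example (illustrative path): should reproduce the model_utils.py line above,
# i.e. "...sha256=S_fGnYucuOH5YfNviH-K85hUjSh1zFRCIjuduax7rvU,11940".
print(record_entry("workbench/utils/model_utils.py"))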
workbench-0.8.171.dist-info/entry_points.txt
CHANGED

@@ -1,6 +1,7 @@
 [console_scripts]
 cloud_watch = workbench.scripts.monitor_cloud_watch:main
 glue_launcher = workbench.scripts.glue_launcher:main
-
+ml_pipeline_batch = workbench.scripts.ml_pipeline_batch:main
+ml_pipeline_sqs = workbench.scripts.ml_pipeline_sqs:main
 workbench = workbench.repl.workbench_shell:launch_shell
 workbench_config = workbench.scripts.show_config:main
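On install, each console_scripts line becomes a shell command (here ml_pipeline_batch and ml_pipeline_sqs) that imports the named module and calls its main function. A small sketch of resolving those entry points programmatically (the group= keyword to entry_points requires Python 3.10+):

# List the workbench ML-pipeline console scripts and the callables they map to.
from importlib.metadata import entry_points

for ep in entry_points(group="console_scripts"):
    if ep.value.startswith("workbench.scripts.ml_pipeline"):
        print(f"{ep.name} -> {ep.value}")
        # main = ep.load()  # would import the module and return its main() callable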
File without changes
File without changes
File without changes