workbench 0.8.168__py3-none-any.whl → 0.8.193__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +143 -102
- workbench/algorithms/graph/light/proximity_graph.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +3 -2
- workbench/api/feature_set.py +4 -4
- workbench/api/model.py +16 -12
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +113 -27
- workbench/core/artifacts/feature_set_core.py +72 -13
- workbench/core/artifacts/model_core.py +71 -49
- workbench/core/artifacts/monitor_core.py +33 -249
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +11 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +11 -6
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +49 -53
- workbench/core/views/view.py +51 -1
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
- workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
- workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +9 -18
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +7 -2
- workbench/model_scripts/uq_models/mapie.template +492 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
- workbench/model_scripts/xgb_model/xgb_model.template +31 -40
- workbench/repl/workbench_shell.py +4 -4
- workbench/scripts/lambda_launcher.py +63 -0
- workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +89 -31
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +300 -151
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/model_details.py +7 -2
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
- {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@
 import logging
 import os
 import tempfile
-import
+import joblib
 import pickle
 import glob
 import awswrangler as wr
@@ -16,15 +16,16 @@ from typing import Dict, Any
 from sklearn.model_selection import KFold, StratifiedKFold
 from sklearn.metrics import (
     precision_recall_fscore_support,
-    confusion_matrix,
     mean_squared_error,
     mean_absolute_error,
     r2_score,
+    median_absolute_error,
 )
+from scipy.stats import spearmanr
 from sklearn.preprocessing import LabelEncoder
 
 # Workbench Imports
-from workbench.utils.model_utils import load_category_mappings_from_s3
+from workbench.utils.model_utils import load_category_mappings_from_s3, safe_extract_tarfile
 from workbench.utils.pandas_utils import convert_categorical_types
 
 # Set up the log
@@ -34,14 +35,12 @@ log = logging.getLogger("workbench")
 def xgboost_model_from_s3(model_artifact_uri: str):
     """
     Download and extract XGBoost model artifact from S3, then load the model into memory.
-    Handles both direct XGBoost model files and pickled models.
-    Ensures categorical feature support is enabled.
 
     Args:
         model_artifact_uri (str): S3 URI of the model artifact.
 
     Returns:
-        Loaded XGBoost model or None if unavailable.
+        Loaded XGBoost model (XGBClassifier, XGBRegressor, or Booster) or None if unavailable.
     """
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -50,68 +49,90 @@ def xgboost_model_from_s3(model_artifact_uri: str):
         wr.s3.download(path=model_artifact_uri, local_file=local_tar_path)
 
         # Extract tarball
-
-            tar.extractall(path=tmpdir, filter="data")
+        safe_extract_tarfile(local_tar_path, tmpdir)
 
         # Define model file patterns to search for (in order of preference)
         patterns = [
-            #
-            os.path.join(tmpdir, "
-            os.path.join(tmpdir, "
-            os.path.join(tmpdir, "*.
+            # Joblib models (preferred - preserves everything)
+            os.path.join(tmpdir, "*model*.joblib"),
+            os.path.join(tmpdir, "xgb*.joblib"),
+            os.path.join(tmpdir, "**", "*model*.joblib"),
+            os.path.join(tmpdir, "**", "xgb*.joblib"),
+            # Pickle models (also preserves everything)
+            os.path.join(tmpdir, "*model*.pkl"),
+            os.path.join(tmpdir, "xgb*.pkl"),
+            os.path.join(tmpdir, "**", "*model*.pkl"),
+            os.path.join(tmpdir, "**", "xgb*.pkl"),
+            # JSON models (fallback - requires reconstruction)
+            os.path.join(tmpdir, "*model*.json"),
+            os.path.join(tmpdir, "xgb*.json"),
             os.path.join(tmpdir, "**", "*model*.json"),
-            os.path.join(tmpdir, "**", "
-            # Pickled models
-            os.path.join(tmpdir, "*.pkl"),
-            os.path.join(tmpdir, "**", "*.pkl"),
-            os.path.join(tmpdir, "*.pickle"),
-            os.path.join(tmpdir, "**", "*.pickle"),
+            os.path.join(tmpdir, "**", "xgb*.json"),
         ]
 
         # Try each pattern
         for pattern in patterns:
-            # Use glob to find all matching files
             for model_path in glob.glob(pattern, recursive=True):
-                #
+                # Skip files that are clearly not XGBoost models
+                filename = os.path.basename(model_path).lower()
+                if any(skip in filename for skip in ["label_encoder", "scaler", "preprocessor", "transformer"]):
+                    log.debug(f"Skipping non-model file: {model_path}")
+                    continue
+
                 _, ext = os.path.splitext(model_path)
 
                 try:
-                    if ext
-
+                    if ext == ".joblib":
+                        model = joblib.load(model_path)
+                        # Verify it's actually an XGBoost model
+                        if isinstance(model, (xgb.XGBClassifier, xgb.XGBRegressor, xgb.Booster)):
+                            log.important(f"Loaded XGBoost model from joblib: {model_path}")
+                            return model
+                        else:
+                            log.debug(f"Skipping non-XGBoost object from {model_path}: {type(model)}")
+
+                    elif ext in [".pkl", ".pickle"]:
                         with open(model_path, "rb") as f:
                             model = pickle.load(f)
-
-
-
-                        log.important(f"Loaded XGBoost Booster from pickle: {model_path}")
+                        # Verify it's actually an XGBoost model
+                        if isinstance(model, (xgb.XGBClassifier, xgb.XGBRegressor, xgb.Booster)):
+                            log.important(f"Loaded XGBoost model from pickle: {model_path}")
                             return model
-
-                        log.
-
-
-
-                    # Handle direct XGBoost model files
+                        else:
+                            log.debug(f"Skipping non-XGBoost object from {model_path}: {type(model)}")
+
+                    elif ext == ".json":
+                        # JSON files should be XGBoost models by definition
                         booster = xgb.Booster()
                         booster.load_model(model_path)
-                        log.important(f"Loaded XGBoost
+                        log.important(f"Loaded XGBoost booster from JSON: {model_path}")
                         return booster
+
                 except Exception as e:
-                    log.
-                    continue
+                    log.debug(f"Failed to load {model_path}: {e}")
+                    continue
 
-        # If no model found
         log.error("No XGBoost model found in the artifact.")
         return None
 
 
-def feature_importance(workbench_model, importance_type: str = "
+def feature_importance(workbench_model, importance_type: str = "gain") -> Optional[List[Tuple[str, float]]]:
     """
     Get sorted feature importances from a Workbench Model object.
 
     Args:
         workbench_model: Workbench model object
-        importance_type: Type of feature importance.
-
+        importance_type: Type of feature importance. Options:
+            - 'gain' (default): Average improvement in loss/objective when feature is used.
+                Best for understanding predictive power of features.
+            - 'weight': Number of times a feature appears in trees (split count).
+                Useful for understanding model complexity and feature usage frequency.
+            - 'cover': Average number of samples affected when feature is used.
+                Shows the relative quantity of observations related to this feature.
+            - 'total_gain': Total improvement in loss/objective across all splits.
+                Similar to 'gain' but not averaged (can be biased toward frequent features).
+            - 'total_cover': Total number of samples affected across all splits.
+                Similar to 'cover' but not averaged.
 
     Returns:
         List of tuples (feature, importance) sorted by importance value (descending).
@@ -120,7 +141,8 @@ def feature_importance(workbench_model, importance_type: str = "weight") -> Opti
 
     Note:
         XGBoost's get_score() only returns features with non-zero importance.
-        This function ensures all model features are included in the output
+        This function ensures all model features are included in the output,
+        adding zero values for features that weren't used in any tree splits.
     """
     model_artifact_uri = workbench_model.model_data_url()
     xgb_model = xgboost_model_from_s3(model_artifact_uri)
@@ -128,11 +150,18 @@ def feature_importance(workbench_model, importance_type: str = "weight") -> Opti
         log.error("No XGBoost model found in the artifact.")
         return None
 
-    #
-
+    # Check if we got a full sklearn model or just a booster (for backwards compatibility)
+    if hasattr(xgb_model, "get_booster"):
+        # Full sklearn model - get the booster for feature importance
+        booster = xgb_model.get_booster()
+        all_features = booster.feature_names
+    else:
+        # Already a booster (legacy JSON load)
+        booster = xgb_model
+        all_features = xgb_model.feature_names
 
-    # Get
-
+    # Get feature importances (only non-zero features)
+    importances = booster.get_score(importance_type=importance_type)
 
     # Create complete importance dict with zeros for missing features
     complete_importances = {feat: importances.get(feat, 0.0) for feat in all_features}
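For orientation, here is a minimal usage sketch of the reworked feature_importance helper. It assumes, per the file list above, that the helper lives in workbench/utils/xgboost_model_utils.py, and it borrows the "abalone-regression" model name from the __main__ examples further down; adjust both to your deployment.

    from workbench.api import Model
    from workbench.utils.xgboost_model_utils import feature_importance  # assumed module path (see file list above)

    model = Model("abalone-regression")  # model name borrowed from the __main__ examples below

    # 'gain' is the new default; 'weight', 'cover', 'total_gain', and 'total_cover' are also accepted
    importances = feature_importance(model, importance_type="gain")
    if importances:
        # Sorted (feature, importance) tuples, including zero-importance features
        for feature, value in importances[:10]:
            print(f"{feature}: {value:.3f}")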
@@ -229,148 +258,260 @@ def leaf_stats(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
     return result_df
 
 
-def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Dict[str, Any]:
+def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[Dict[str, Any], pd.DataFrame]:
     """
     Performs K-fold cross-validation with detailed metrics.
     Args:
         workbench_model: Workbench model object
         nfolds: Number of folds for cross-validation (default is 5)
     Returns:
-
-        -
-
-
+        Tuple of:
+            - Dictionary containing:
+                - folds: Dictionary of formatted strings for each fold
+                - summary_metrics: Summary metrics across folds
+            - DataFrame with columns: id, target, prediction (out-of-fold predictions for all samples)
     """
     from workbench.api import FeatureSet
 
     # Load model
-    model_type = workbench_model.model_type.value
     model_artifact_uri = workbench_model.model_data_url()
-
-    if
+    loaded_model = xgboost_model_from_s3(model_artifact_uri)
+    if loaded_model is None:
         log.error("No XGBoost model found in the artifact.")
-        return {}
-
-
-
-
-
-
+        return {}, pd.DataFrame()
+
+    # Check if we got a full sklearn model or need to create one
+    if isinstance(loaded_model, (xgb.XGBClassifier, xgb.XGBRegressor)):
+        is_classifier = isinstance(loaded_model, xgb.XGBClassifier)
+
+        # Get the model's hyperparameters and ensure enable_categorical=True
+        params = loaded_model.get_params()
+        params["enable_categorical"] = True
+
+        # Create new model with same params but enable_categorical=True
+        if is_classifier:
+            xgb_model = xgb.XGBClassifier(**params)
+        else:
+            xgb_model = xgb.XGBRegressor(**params)
+
+    elif isinstance(loaded_model, xgb.Booster):
+        # Legacy: got a booster, need to wrap it
+        log.warning("Deprecated: Loaded model is a Booster, wrapping in sklearn model.")
+        is_classifier = workbench_model.model_type.value == "classifier"
+        xgb_model = (
+            xgb.XGBClassifier(enable_categorical=True) if is_classifier else xgb.XGBRegressor(enable_categorical=True)
+        )
+        xgb_model._Booster = loaded_model
+    else:
+        log.error(f"Unexpected model type: {type(loaded_model)}")
+        return {}, pd.DataFrame()
+
     # Prepare data
     fs = FeatureSet(workbench_model.get_input())
-    df =
+    df = workbench_model.training_view().pull_dataframe()
+
+    # Get id column - assuming FeatureSet has an id_column attribute or similar
+    id_col = fs.id_column
+    target_col = workbench_model.target()
     feature_cols = workbench_model.features()
-
+
+    # Convert string[python] to object, then to category for XGBoost compatibility
+    # This avoids XGBoost's issue with pandas 2.x string[python] dtype in categorical categories
     for col in feature_cols:
-        if df[col]
-
-
-    X = df[workbench_model.features()]
-    y = df[workbench_model.target()]
+        if pd.api.types.is_string_dtype(df[col]):
+            # Double conversion: string[python] -> object -> category
+            df[col] = df[col].astype("object").astype("category")
 
-
+    X = df[feature_cols]
+    y = df[target_col]
+    ids = df[id_col]
+
+    # Encode target if classifier
     label_encoder = LabelEncoder() if is_classifier else None
     if label_encoder:
-
+        y_encoded = label_encoder.fit_transform(y)
+        y_for_cv = pd.Series(y_encoded, index=y.index, name=target_col)
+    else:
+        y_for_cv = y
+
     # Prepare KFold
-    kfold = (
-
-
-
-    )
+    kfold = (StratifiedKFold if is_classifier else KFold)(n_splits=nfolds, shuffle=True, random_state=42)
+
+    # Initialize results collection
+    fold_metrics = []
+    predictions_df = pd.DataFrame({id_col: ids, target_col: y})  # Keep original values
+    # Note: 'prediction' column will be created automatically with correct dtype
 
-
-
-    all_actuals = []
-    for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
+    # Perform cross-validation
+    for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X, y_for_cv), 1):
         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
-        y_train, y_val =
+        y_train, y_val = y_for_cv.iloc[train_idx], y_for_cv.iloc[val_idx]
 
-        # Train
+        # Train and predict
         xgb_model.fit(X_train, y_train)
         preds = xgb_model.predict(X_val)
-        all_predictions.extend(preds)
-        all_actuals.extend(y_val)
 
-        #
-
+        # Store predictions (decode if classifier)
+        val_indices = X_val.index
+        if is_classifier:
+            predictions_df.loc[val_indices, "prediction"] = label_encoder.inverse_transform(preds.astype(int))
+        else:
+            predictions_df.loc[val_indices, "prediction"] = preds
 
+        # Calculate fold metrics
         if is_classifier:
-
-
-
-
+            y_val_orig = label_encoder.inverse_transform(y_val)
+            preds_orig = label_encoder.inverse_transform(preds.astype(int))
+            prec, rec, f1, _ = precision_recall_fscore_support(
+                y_val_orig, preds_orig, average="weighted", zero_division=0
             )
-            fold_metrics.
+            fold_metrics.append({"fold": fold_idx, "precision": prec, "recall": rec, "fscore": f1})
         else:
-
+            spearman_corr, _ = spearmanr(y_val, preds)
+            fold_metrics.append(
                 {
-                    "
-                    "
-                    "
+                    "fold": fold_idx,
+                    "rmse": np.sqrt(mean_squared_error(y_val, preds)),
+                    "mae": mean_absolute_error(y_val, preds),
+                    "medae": median_absolute_error(y_val, preds),
+                    "r2": r2_score(y_val, preds),
+                    "spearmanr": spearman_corr,
                 }
             )
 
-
-
-
-
-
-
-        scores = precision_recall_fscore_support(
-            all_actuals_original, all_predictions_original, average="weighted", zero_division=0
-        )
-        overall_metrics.update(
-            {
-                "precision": float(scores[0]),
-                "recall": float(scores[1]),
-                "fscore": float(scores[2]),
-                "confusion_matrix": confusion_matrix(
-                    all_actuals_original, all_predictions_original, labels=label_encoder.classes_
-                ).tolist(),
-                "label_names": list(label_encoder.classes_),
-            }
-        )
-    else:
-        overall_metrics.update(
-            {
-                "rmse": float(np.sqrt(mean_squared_error(all_actuals, all_predictions))),
-                "mae": float(mean_absolute_error(all_actuals, all_predictions)),
-                "r2": float(r2_score(all_actuals, all_predictions)),
-            }
-        )
-    # Calculate summary metrics across folds
-    summary_metrics = {}
-    metrics_to_aggregate = ["precision", "recall", "fscore"] if is_classifier else ["rmse", "mae", "r2"]
-
-    for metric in metrics_to_aggregate:
-        values = [fold[metric] for fold in fold_results]
-        summary_metrics[metric] = f"{float(np.mean(values)):.3f} ±{float(np.std(values)):.3f}"
-    # Format fold results as strings (TBD section)
+    # Calculate summary metrics (mean ± std)
+    fold_df = pd.DataFrame(fold_metrics)
+    metric_names = ["precision", "recall", "fscore"] if is_classifier else ["rmse", "mae", "medae", "r2", "spearmanr"]
+    summary_metrics = {metric: f"{fold_df[metric].mean():.3f} ±{fold_df[metric].std():.3f}" for metric in metric_names}
+
+    # Format fold results for display
     formatted_folds = {}
-    for
-        fold_key = f"Fold {
+    for _, row in fold_df.iterrows():
+        fold_key = f"Fold {int(row['fold'])}"
         if is_classifier:
             formatted_folds[fold_key] = (
-                f"precision: {
-                f"recall: {fold_data['recall']:.3f} "
-                f"fscore: {fold_data['fscore']:.3f}"
+                f"precision: {row['precision']:.3f} " f"recall: {row['recall']:.3f} " f"fscore: {row['fscore']:.3f}"
             )
         else:
             formatted_folds[fold_key] = (
-                f"rmse: {
+                f"rmse: {row['rmse']:.3f} "
+                f"mae: {row['mae']:.3f} "
+                f"medae: {row['medae']:.3f} "
+                f"r2: {row['r2']:.3f} "
+                f"spearmanr: {row['spearmanr']:.3f}"
            )
 
-
-    return
-
-
-
-
+
+    # Build return dictionary
+    metrics_dict = {"summary_metrics": summary_metrics, "folds": formatted_folds}
+
+    return metrics_dict, predictions_df
+
+
+def leave_one_out_inference(workbench_model: Any) -> pd.DataFrame:
+    """
+    Performs leave-one-out cross-validation (parallelized).
+    For datasets > 1000 rows, first identifies top 100 worst predictions via 10-fold CV,
+    then performs true leave-one-out on those 100 samples.
+    Each model trains on ALL data except one sample.
+    """
+    from workbench.api import FeatureSet
+    from joblib import Parallel, delayed
+    from tqdm import tqdm
+
+    def train_and_predict_one(model_params, is_classifier, X, y, train_idx, val_idx):
+        """Train on train_idx, predict on val_idx."""
+        model = xgb.XGBClassifier(**model_params) if is_classifier else xgb.XGBRegressor(**model_params)
+        model.fit(X[train_idx], y[train_idx])
+        return model.predict(X[val_idx])[0]
+
+    # Load model and get params
+    model_artifact_uri = workbench_model.model_data_url()
+    loaded_model = xgboost_model_from_s3(model_artifact_uri)
+    if loaded_model is None:
+        log.error("No XGBoost model found in the artifact.")
+        return pd.DataFrame()
+
+    if isinstance(loaded_model, (xgb.XGBClassifier, xgb.XGBRegressor)):
+        is_classifier = isinstance(loaded_model, xgb.XGBClassifier)
+        model_params = loaded_model.get_params()
+    elif isinstance(loaded_model, xgb.Booster):
+        log.warning("Deprecated: Loaded model is a Booster, wrapping in sklearn model.")
+        is_classifier = workbench_model.model_type.value == "classifier"
+        model_params = {"enable_categorical": True}
+    else:
+        log.error(f"Unexpected model type: {type(loaded_model)}")
+        return pd.DataFrame()
+
+    # Load and prepare data
+    fs = FeatureSet(workbench_model.get_input())
+    df = workbench_model.training_view().pull_dataframe()
+    id_col = fs.id_column
+    target_col = workbench_model.target()
+    feature_cols = workbench_model.features()
+
+    # Convert string[python] to object, then to category for XGBoost compatibility
+    # This avoids XGBoost's issue with pandas 2.x string[python] dtype in categorical categories
+    for col in feature_cols:
+        if pd.api.types.is_string_dtype(df[col]):
+            # Double conversion: string[python] -> object -> category
+            df[col] = df[col].astype("object").astype("category")
+
+    # Determine which samples to run LOO on
+    if len(df) > 1000:
+        log.important(f"Dataset has {len(df)} rows. Running 10-fold CV to identify top 1000 worst predictions...")
+        _, predictions_df = cross_fold_inference(workbench_model, nfolds=10)
+        predictions_df["residual_abs"] = np.abs(predictions_df[target_col] - predictions_df["prediction"])
+        worst_samples = predictions_df.nlargest(1000, "residual_abs")
+        worst_ids = worst_samples[id_col].values
+        loo_indices = df[df[id_col].isin(worst_ids)].index.values
+        log.important(f"Running leave-one-out CV on 1000 worst samples. Each model trains on {len(df)-1} rows...")
+    else:
+        log.important(f"Running leave-one-out CV on all {len(df)} samples...")
+        loo_indices = df.index.values
+
+    # Prepare full dataset for training
+    X_full = df[feature_cols].values
+    y_full = df[target_col].values
+
+    # Encode target if classifier
+    label_encoder = LabelEncoder() if is_classifier else None
+    if label_encoder:
+        y_full = label_encoder.fit_transform(y_full)
+
+    # Generate LOO splits
+    splits = []
+    for loo_idx in loo_indices:
+        train_idx = np.delete(np.arange(len(X_full)), loo_idx)
+        val_idx = np.array([loo_idx])
+        splits.append((train_idx, val_idx))
+
+    # Parallel execution
+    predictions = Parallel(n_jobs=4)(
+        delayed(train_and_predict_one)(model_params, is_classifier, X_full, y_full, train_idx, val_idx)
+        for train_idx, val_idx in tqdm(splits, desc="LOO CV")
+    )
+
+    # Build results dataframe
+    predictions_array = np.array(predictions)
+    if label_encoder:
+        predictions_array = label_encoder.inverse_transform(predictions_array.astype(int))
+
+    predictions_df = pd.DataFrame(
+        {
+            id_col: df.loc[loo_indices, id_col].values,
+            target_col: df.loc[loo_indices, target_col].values,
+            "prediction": predictions_array,
+        }
+    )
+
+    predictions_df["residual_abs"] = np.abs(predictions_df[target_col] - predictions_df["prediction"])
+
+    return predictions_df
 
 
 if __name__ == "__main__":
     """Exercise the Model Utilities"""
-    from workbench.api import Model
+    from workbench.api import Model
     from pprint import pprint
 
     # Test the XGBoost model loading and feature importance
@@ -383,11 +524,28 @@ if __name__ == "__main__":
     model_artifact_uri = model.model_data_url()
     xgb_model = xgboost_model_from_s3(model_artifact_uri)
 
+    # Verify enable_categorical is preserved (for debugging/confidence)
+    print(f"Model parameters: {xgb_model.get_params()}")
+    print(f"enable_categorical: {xgb_model.enable_categorical}")
+
     # Test with UQ Model
     uq_model = Model("aqsol-uq")
     _xgb_model = xgboost_model_from_s3(uq_model.model_data_url())
 
+    print("\n=== CROSS FOLD REGRESSION EXAMPLE ===")
+    model = Model("abalone-regression")
+    results, df = cross_fold_inference(model)
+    pprint(results)
+    print(df.head())
+
+    print("\n=== CROSS FOLD CLASSIFICATION EXAMPLE ===")
+    model = Model("wine-classification")
+    results, df = cross_fold_inference(model)
+    pprint(results)
+    print(df.head())
+
     # Test XGBoost add_leaf_hash
+    """
     input_df = FeatureSet(model.get_input()).pull_dataframe()
     leaf_df = add_leaf_hash(model, input_df)
     print("DataFrame with Leaf Hash:")
@@ -404,13 +562,4 @@ if __name__ == "__main__":
     stats_df = leaf_stats(leaf_df, target_col)
     print("DataFrame with Leaf Statistics:")
     print(stats_df)
-
-    print("\n=== CROSS FOLD REGRESSION EXAMPLE ===")
-    model = Model("abalone-regression")
-    results = cross_fold_inference(model)
-    pprint(results)
-
-    print("\n=== CROSS FOLD CLASSIFICATION EXAMPLE ===")
-    model = Model("wine-classification")
-    results = cross_fold_inference(model)
-    pprint(results)
+    """
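The new leave_one_out_inference helper is not exercised in the __main__ examples above, so here is a minimal, hedged sketch of how the two cross-validation helpers could be called together (assuming, per the file list, that they live in workbench/utils/xgboost_model_utils.py, and reusing the "abalone-regression" model name from the examples above):

    from workbench.api import Model
    from workbench.utils.xgboost_model_utils import cross_fold_inference, leave_one_out_inference  # assumed path

    model = Model("abalone-regression")  # deployed model name reused from the examples above

    # cross_fold_inference now returns (metrics_dict, out-of-fold predictions DataFrame)
    metrics, oof_df = cross_fold_inference(model, nfolds=5)
    print(metrics["summary_metrics"])

    # leave_one_out_inference returns id, target, prediction, and residual_abs columns
    loo_df = leave_one_out_inference(model)
    print(loo_df.sort_values("residual_abs", ascending=False).head(10))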
@@ -39,7 +39,13 @@ class ModelPlot(ComponentInterface):
             # Calculate the distance from the diagonal for each point
             target = model.target()
             df["error"] = abs(df["prediction"] - df[target])
-            return ScatterPlot().update_properties(
+            return ScatterPlot().update_properties(
+                df,
+                color="error",
+                regression_line=True,
+                x=target,
+                y="prediction",
+            )[0]
         else:
             return self.display_text(f"Model Type: {model.model_type}\n\n Awesome Plot Coming Soon!")
 
@@ -72,7 +72,9 @@ class DashboardStatus(PluginInterface):
             details = "**Redis:** 🔴 Failed to Connect<br>"
 
         # Fill in the license details
-
+        redis_host = config_info.get("REDIS_HOST", "NOT SET")
+        redis_port = config_info.get("REDIS_PORT", "NOT SET")
+        details += f"**Redis Server:** {redis_host}:{redis_port}<br>"
         details += f"**Workbench S3 Bucket:** {config_info['WORKBENCH_BUCKET']}<br>"
         details += f"**Plugin Path:** {config_info.get('WORKBENCH_PLUGINS', 'unknown')}<br>"
         details += f"**Themes Path:** {config_info.get('WORKBENCH_THEMES', 'unknown')}<br>"
@@ -5,7 +5,7 @@ import dash_bootstrap_components as dbc
 
 # Workbench Imports
 from workbench.api.compound import Compound
-from workbench.utils.chem_utils import svg_from_smiles
+from workbench.utils.chem_utils.vis import svg_from_smiles
 from workbench.web_interface.components.plugin_interface import PluginInterface, PluginPage, PluginInputType
 from workbench.utils.theme_manager import ThemeManager
 from workbench.utils.ai_summary import AISummary
@@ -249,8 +249,13 @@ class ModelDetails(PluginInterface):
         if not inference_runs:
             return [], None
 
-        #
-
+        # Default inference run (full_cross_fold if it exists, then auto_inference, then first)
+        if "full_cross_fold" in inference_runs:
+            default_inference_run = "full_cross_fold"
+        elif "auto_inference" in inference_runs:
+            default_inference_run = "auto_inference"
+        else:
+            default_inference_run = inference_runs[0]
 
         # Return the options for the dropdown and the selected value
         return inference_runs, default_inference_run