workbench 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic. Click here for more details.
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +78 -150
- workbench/algorithms/graph/light/proximity_graph.py +5 -5
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +3 -0
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +13 -11
- workbench/api/feature_set.py +111 -8
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +45 -12
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +228 -237
- workbench/core/artifacts/feature_set_core.py +185 -230
- workbench/core/artifacts/model_core.py +34 -26
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +22 -10
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +428 -631
- workbench/model_scripts/chemprop/generated_model_script.py +432 -635
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +2 -10
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +370 -609
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/script_generation.py +6 -5
- workbench/model_scripts/uq_models/generated_model_script.py +65 -422
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +366 -396
- workbench/repl/workbench_shell.py +0 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +2 -2
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/training_test.py +85 -0
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chemprop_utils.py +36 -655
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +192 -54
- workbench/utils/pytorch_utils.py +33 -472
- workbench/utils/shap_utils.py +1 -55
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +49 -356
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugins/model_details.py +30 -68
- workbench/web_interface/components/plugins/scatter_plot.py +4 -8
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
- workbench/model_scripts/uq_models/mapie.template +0 -605
- workbench/model_scripts/uq_models/requirements.txt +0 -1
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
workbench/utils/pytorch_utils.py
CHANGED
|
@@ -1,169 +1,53 @@
|
|
|
1
1
|
"""PyTorch Tabular utilities for Workbench models."""
|
|
2
2
|
|
|
3
|
-
# flake8: noqa: E402
|
|
4
3
|
import logging
|
|
5
4
|
import os
|
|
5
|
+
import tarfile
|
|
6
6
|
import tempfile
|
|
7
|
-
from pprint import pformat
|
|
8
7
|
from typing import Any, Tuple
|
|
9
8
|
|
|
10
|
-
|
|
11
|
-
# (libomp from LLVM vs libiomp from Intel). Must be set before importing numpy/sklearn/torch.
|
|
12
|
-
# See: https://github.com/scikit-learn/scikit-learn/issues/21302
|
|
13
|
-
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
|
14
|
-
os.environ.setdefault("MKL_NUM_THREADS", "1")
|
|
15
|
-
|
|
16
|
-
import numpy as np
|
|
9
|
+
import awswrangler as wr
|
|
17
10
|
import pandas as pd
|
|
18
|
-
from scipy.stats import spearmanr
|
|
19
|
-
from sklearn.metrics import (
|
|
20
|
-
mean_absolute_error,
|
|
21
|
-
mean_squared_error,
|
|
22
|
-
median_absolute_error,
|
|
23
|
-
precision_recall_fscore_support,
|
|
24
|
-
r2_score,
|
|
25
|
-
roc_auc_score,
|
|
26
|
-
)
|
|
27
|
-
from sklearn.model_selection import KFold, StratifiedKFold
|
|
28
|
-
from sklearn.preprocessing import LabelEncoder
|
|
29
11
|
|
|
30
|
-
from workbench.utils.model_utils import safe_extract_tarfile
|
|
31
|
-
from workbench.utils.pandas_utils import expand_proba_column
|
|
32
12
|
from workbench.utils.aws_utils import pull_s3_data
|
|
13
|
+
from workbench.utils.metrics_utils import compute_metrics_from_predictions
|
|
33
14
|
|
|
34
15
|
log = logging.getLogger("workbench")
|
|
35
16
|
|
|
36
17
|
|
|
37
18
|
def download_and_extract_model(s3_uri: str, model_dir: str) -> None:
|
|
38
|
-
"""Download model artifact from S3
|
|
39
|
-
|
|
40
|
-
Args:
|
|
41
|
-
s3_uri: S3 URI to the model artifact (model.tar.gz)
|
|
42
|
-
model_dir: Directory to extract model artifacts to
|
|
43
|
-
"""
|
|
44
|
-
import awswrangler as wr
|
|
45
|
-
|
|
46
|
-
log.info(f"Downloading model from {s3_uri}...")
|
|
47
|
-
|
|
48
|
-
# Download to temp file
|
|
49
|
-
local_tar_path = os.path.join(model_dir, "model.tar.gz")
|
|
50
|
-
wr.s3.download(path=s3_uri, local_file=local_tar_path)
|
|
51
|
-
|
|
52
|
-
# Extract using safe extraction
|
|
53
|
-
log.info(f"Extracting to {model_dir}...")
|
|
54
|
-
safe_extract_tarfile(local_tar_path, model_dir)
|
|
55
|
-
|
|
56
|
-
# Cleanup tar file
|
|
57
|
-
os.unlink(local_tar_path)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def load_pytorch_model_artifacts(model_dir: str) -> Tuple[Any, dict]:
|
|
61
|
-
"""Load PyTorch Tabular model and artifacts from an extracted model directory.
|
|
19
|
+
"""Download and extract a PyTorch model artifact from S3.
|
|
62
20
|
|
|
63
21
|
Args:
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
Tuple of (TabularModel, artifacts_dict).
|
|
68
|
-
artifacts_dict contains 'label_encoder' and 'category_mappings' if present.
|
|
22
|
+
s3_uri: S3 URI of the model.tar.gz artifact
|
|
23
|
+
model_dir: Local directory to extract the model to
|
|
69
24
|
"""
|
|
70
|
-
|
|
25
|
+
with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
|
|
26
|
+
tmp_path = tmp.name
|
|
71
27
|
|
|
72
|
-
import joblib
|
|
73
|
-
|
|
74
|
-
# pytorch-tabular saves complex objects, use legacy loading behavior
|
|
75
|
-
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
|
|
76
|
-
from pytorch_tabular import TabularModel
|
|
77
|
-
|
|
78
|
-
model_path = os.path.join(model_dir, "tabular_model")
|
|
79
|
-
if not os.path.exists(model_path):
|
|
80
|
-
raise FileNotFoundError(f"No tabular_model directory found in {model_dir}")
|
|
81
|
-
|
|
82
|
-
# PyTorch Tabular needs write access, so chdir to /tmp
|
|
83
|
-
original_cwd = os.getcwd()
|
|
84
28
|
try:
|
|
85
|
-
|
|
86
|
-
|
|
29
|
+
wr.s3.download(path=s3_uri, local_file=tmp_path)
|
|
30
|
+
with tarfile.open(tmp_path, "r:gz") as tar:
|
|
31
|
+
tar.extractall(model_dir)
|
|
32
|
+
log.info(f"Extracted model to {model_dir}")
|
|
87
33
|
finally:
|
|
88
|
-
os.
|
|
89
|
-
|
|
90
|
-
# Load additional artifacts
|
|
91
|
-
artifacts = {}
|
|
92
|
-
|
|
93
|
-
label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
|
|
94
|
-
if os.path.exists(label_encoder_path):
|
|
95
|
-
artifacts["label_encoder"] = joblib.load(label_encoder_path)
|
|
96
|
-
|
|
97
|
-
category_mappings_path = os.path.join(model_dir, "category_mappings.json")
|
|
98
|
-
if os.path.exists(category_mappings_path):
|
|
99
|
-
with open(category_mappings_path) as f:
|
|
100
|
-
artifacts["category_mappings"] = json.load(f)
|
|
101
|
-
|
|
102
|
-
return model, artifacts
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def _extract_model_configs(loaded_model: Any, n_train: int) -> dict:
|
|
106
|
-
"""Extract trainer and model configs from a loaded PyTorch Tabular model.
|
|
107
|
-
|
|
108
|
-
Args:
|
|
109
|
-
loaded_model: Loaded TabularModel instance
|
|
110
|
-
n_train: Number of training samples (used for batch_size calculation)
|
|
111
|
-
|
|
112
|
-
Returns:
|
|
113
|
-
Dictionary with 'trainer' and 'model' config dictionaries
|
|
114
|
-
"""
|
|
115
|
-
config = loaded_model.config
|
|
116
|
-
|
|
117
|
-
# Trainer config - extract from loaded model, matching template defaults
|
|
118
|
-
trainer_defaults = {
|
|
119
|
-
"auto_lr_find": False,
|
|
120
|
-
"batch_size": min(128, max(32, n_train // 16)),
|
|
121
|
-
"max_epochs": 100,
|
|
122
|
-
"min_epochs": 10,
|
|
123
|
-
"early_stopping": "valid_loss",
|
|
124
|
-
"early_stopping_patience": 10,
|
|
125
|
-
"gradient_clip_val": 1.0,
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
trainer_config = {}
|
|
129
|
-
for key, default in trainer_defaults.items():
|
|
130
|
-
value = getattr(config, key, default)
|
|
131
|
-
if value == default and not hasattr(config, key):
|
|
132
|
-
log.warning(f"Trainer config '{key}' not found in loaded model, using default: {default}")
|
|
133
|
-
trainer_config[key] = value
|
|
134
|
-
|
|
135
|
-
# Model config - extract from loaded model, matching template defaults
|
|
136
|
-
model_defaults = {
|
|
137
|
-
"layers": "256-128-64",
|
|
138
|
-
"activation": "LeakyReLU",
|
|
139
|
-
"learning_rate": 1e-3,
|
|
140
|
-
"dropout": 0.3,
|
|
141
|
-
"use_batch_norm": True,
|
|
142
|
-
"initialization": "kaiming",
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
model_config = {}
|
|
146
|
-
for key, default in model_defaults.items():
|
|
147
|
-
value = getattr(config, key, default)
|
|
148
|
-
if value == default and not hasattr(config, key):
|
|
149
|
-
log.warning(f"Model config '{key}' not found in loaded model, using default: {default}")
|
|
150
|
-
model_config[key] = value
|
|
151
|
-
|
|
152
|
-
return {"trainer": trainer_config, "model": model_config}
|
|
34
|
+
if os.path.exists(tmp_path):
|
|
35
|
+
os.remove(tmp_path)
|
|
153
36
|
|
|
154
37
|
|
|
155
38
|
def pull_cv_results(workbench_model: Any) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
156
39
|
"""Pull cross-validation results from AWS training artifacts.
|
|
157
40
|
|
|
158
|
-
This retrieves the validation predictions
|
|
159
|
-
|
|
41
|
+
This retrieves the validation predictions saved during model training and
|
|
42
|
+
computes metrics directly from them. For PyTorch models trained with
|
|
43
|
+
n_folds > 1, these are out-of-fold predictions from k-fold cross-validation.
|
|
160
44
|
|
|
161
45
|
Args:
|
|
162
46
|
workbench_model: Workbench model object
|
|
163
47
|
|
|
164
48
|
Returns:
|
|
165
49
|
Tuple of:
|
|
166
|
-
- DataFrame with
|
|
50
|
+
- DataFrame with computed metrics
|
|
167
51
|
- DataFrame with validation predictions
|
|
168
52
|
"""
|
|
169
53
|
# Get the validation predictions from S3
|
|
@@ -175,352 +59,29 @@ def pull_cv_results(workbench_model: Any) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
|
175
59
|
|
|
176
60
|
log.info(f"Pulled {len(predictions_df)} validation predictions from {s3_path}")
|
|
177
61
|
|
|
178
|
-
#
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
if training_metrics is None:
|
|
182
|
-
raise ValueError(f"No training metrics found in model metadata for {workbench_model.model_name}")
|
|
62
|
+
# Compute metrics from predictions
|
|
63
|
+
target = workbench_model.target()
|
|
64
|
+
class_labels = workbench_model.class_labels()
|
|
183
65
|
|
|
184
|
-
|
|
185
|
-
|
|
66
|
+
if target in predictions_df.columns and "prediction" in predictions_df.columns:
|
|
67
|
+
metrics_df = compute_metrics_from_predictions(predictions_df, target, class_labels)
|
|
68
|
+
else:
|
|
69
|
+
metrics_df = pd.DataFrame()
|
|
186
70
|
|
|
187
71
|
return metrics_df, predictions_df
|
|
188
72
|
|
|
189
73
|
|
|
190
|
-
def cross_fold_inference(
|
|
191
|
-
workbench_model: Any,
|
|
192
|
-
nfolds: int = 5,
|
|
193
|
-
) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
194
|
-
"""Performs K-fold cross-validation for PyTorch Tabular models.
|
|
195
|
-
|
|
196
|
-
Replicates the training setup from the original model to ensure
|
|
197
|
-
cross-validation results are comparable to the deployed model.
|
|
198
|
-
|
|
199
|
-
Args:
|
|
200
|
-
workbench_model: Workbench model object
|
|
201
|
-
nfolds: Number of folds for cross-validation (default is 5)
|
|
202
|
-
|
|
203
|
-
Returns:
|
|
204
|
-
Tuple of:
|
|
205
|
-
- DataFrame with per-class metrics (and 'all' row for overall metrics)
|
|
206
|
-
- DataFrame with columns: id, target, prediction, and *_proba columns (for classifiers)
|
|
207
|
-
"""
|
|
208
|
-
import shutil
|
|
209
|
-
|
|
210
|
-
from pytorch_tabular import TabularModel
|
|
211
|
-
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
|
|
212
|
-
from pytorch_tabular.models import CategoryEmbeddingModelConfig
|
|
213
|
-
|
|
214
|
-
from workbench.api import FeatureSet
|
|
215
|
-
|
|
216
|
-
# Create a temporary model directory
|
|
217
|
-
model_dir = tempfile.mkdtemp(prefix="pytorch_cv_")
|
|
218
|
-
log.info(f"Using model directory: {model_dir}")
|
|
219
|
-
|
|
220
|
-
try:
|
|
221
|
-
# Download and extract model artifacts to get config and artifacts
|
|
222
|
-
model_artifact_uri = workbench_model.model_data_url()
|
|
223
|
-
download_and_extract_model(model_artifact_uri, model_dir)
|
|
224
|
-
|
|
225
|
-
# Load model and artifacts
|
|
226
|
-
loaded_model, artifacts = load_pytorch_model_artifacts(model_dir)
|
|
227
|
-
category_mappings = artifacts.get("category_mappings", {})
|
|
228
|
-
|
|
229
|
-
# Determine if classifier from the loaded model's config
|
|
230
|
-
is_classifier = loaded_model.config.task == "classification"
|
|
231
|
-
|
|
232
|
-
# Use saved label encoder if available, otherwise create fresh one
|
|
233
|
-
if is_classifier:
|
|
234
|
-
label_encoder = artifacts.get("label_encoder")
|
|
235
|
-
if label_encoder is None:
|
|
236
|
-
log.warning("No saved label encoder found, creating fresh one")
|
|
237
|
-
label_encoder = LabelEncoder()
|
|
238
|
-
else:
|
|
239
|
-
label_encoder = None
|
|
240
|
-
|
|
241
|
-
# Prepare data
|
|
242
|
-
fs = FeatureSet(workbench_model.get_input())
|
|
243
|
-
df = workbench_model.training_view().pull_dataframe()
|
|
244
|
-
|
|
245
|
-
# Get columns
|
|
246
|
-
id_col = fs.id_column
|
|
247
|
-
target_col = workbench_model.target()
|
|
248
|
-
feature_cols = workbench_model.features()
|
|
249
|
-
print(f"Target column: {target_col}")
|
|
250
|
-
print(f"Feature columns: {len(feature_cols)} features")
|
|
251
|
-
|
|
252
|
-
# Convert string columns to category for PyTorch Tabular compatibility
|
|
253
|
-
for col in feature_cols:
|
|
254
|
-
if pd.api.types.is_string_dtype(df[col]):
|
|
255
|
-
if col in category_mappings:
|
|
256
|
-
df[col] = pd.Categorical(df[col], categories=category_mappings[col])
|
|
257
|
-
else:
|
|
258
|
-
df[col] = df[col].astype("category")
|
|
259
|
-
|
|
260
|
-
# Determine categorical and continuous columns
|
|
261
|
-
categorical_cols = [col for col in feature_cols if df[col].dtype.name == "category"]
|
|
262
|
-
continuous_cols = [col for col in feature_cols if col not in categorical_cols]
|
|
263
|
-
|
|
264
|
-
# Cast continuous columns to float
|
|
265
|
-
if continuous_cols:
|
|
266
|
-
df[continuous_cols] = df[continuous_cols].astype("float64")
|
|
267
|
-
|
|
268
|
-
# Drop rows with NaN features or target (PyTorch Tabular cannot handle NaN values)
|
|
269
|
-
nan_mask = df[feature_cols].isna().any(axis=1) | df[target_col].isna()
|
|
270
|
-
if nan_mask.any():
|
|
271
|
-
n_nan_rows = nan_mask.sum()
|
|
272
|
-
log.warning(
|
|
273
|
-
f"Dropping {n_nan_rows} rows ({100*n_nan_rows/len(df):.1f}%) with NaN values for cross-validation"
|
|
274
|
-
)
|
|
275
|
-
df = df[~nan_mask].reset_index(drop=True)
|
|
276
|
-
|
|
277
|
-
X = df[feature_cols]
|
|
278
|
-
y = df[target_col]
|
|
279
|
-
ids = df[id_col]
|
|
280
|
-
|
|
281
|
-
# Encode target if classifier
|
|
282
|
-
if label_encoder is not None:
|
|
283
|
-
if not hasattr(label_encoder, "classes_"):
|
|
284
|
-
label_encoder.fit(y)
|
|
285
|
-
y_encoded = label_encoder.transform(y)
|
|
286
|
-
y_for_cv = pd.Series(y_encoded, index=y.index, name=target_col)
|
|
287
|
-
else:
|
|
288
|
-
y_for_cv = y
|
|
289
|
-
|
|
290
|
-
# Extract configs from loaded model (pass approx train size for batch_size calculation)
|
|
291
|
-
n_train_approx = int(len(df) * (1 - 1 / nfolds))
|
|
292
|
-
configs = _extract_model_configs(loaded_model, n_train_approx)
|
|
293
|
-
trainer_params = configs["trainer"]
|
|
294
|
-
model_params = configs["model"]
|
|
295
|
-
|
|
296
|
-
log.info(f"Trainer config:\n{pformat(trainer_params)}")
|
|
297
|
-
log.info(f"Model config:\n{pformat(model_params)}")
|
|
298
|
-
|
|
299
|
-
# Prepare KFold
|
|
300
|
-
kfold = (StratifiedKFold if is_classifier else KFold)(n_splits=nfolds, shuffle=True, random_state=42)
|
|
301
|
-
|
|
302
|
-
# Initialize results collection
|
|
303
|
-
fold_metrics = []
|
|
304
|
-
predictions_df = pd.DataFrame({id_col: ids, target_col: y})
|
|
305
|
-
if is_classifier:
|
|
306
|
-
predictions_df["pred_proba"] = [None] * len(predictions_df)
|
|
307
|
-
|
|
308
|
-
# Perform cross-validation
|
|
309
|
-
for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X, y_for_cv), 1):
|
|
310
|
-
print(f"\n{'='*50}")
|
|
311
|
-
print(f"Fold {fold_idx}/{nfolds}")
|
|
312
|
-
print(f"{'='*50}")
|
|
313
|
-
|
|
314
|
-
# Split data
|
|
315
|
-
df_train = df.iloc[train_idx].copy()
|
|
316
|
-
df_val = df.iloc[val_idx].copy()
|
|
317
|
-
|
|
318
|
-
# Encode target for this fold
|
|
319
|
-
if is_classifier:
|
|
320
|
-
df_train[target_col] = label_encoder.transform(df_train[target_col])
|
|
321
|
-
df_val[target_col] = label_encoder.transform(df_val[target_col])
|
|
322
|
-
|
|
323
|
-
# Create configs for this fold - matching the training template exactly
|
|
324
|
-
data_config = DataConfig(
|
|
325
|
-
target=[target_col],
|
|
326
|
-
continuous_cols=continuous_cols,
|
|
327
|
-
categorical_cols=categorical_cols,
|
|
328
|
-
)
|
|
329
|
-
|
|
330
|
-
trainer_config = TrainerConfig(
|
|
331
|
-
auto_lr_find=trainer_params["auto_lr_find"],
|
|
332
|
-
batch_size=trainer_params["batch_size"],
|
|
333
|
-
max_epochs=trainer_params["max_epochs"],
|
|
334
|
-
min_epochs=trainer_params["min_epochs"],
|
|
335
|
-
early_stopping=trainer_params["early_stopping"],
|
|
336
|
-
early_stopping_patience=trainer_params["early_stopping_patience"],
|
|
337
|
-
gradient_clip_val=trainer_params["gradient_clip_val"],
|
|
338
|
-
checkpoints="valid_loss", # Save best model based on validation loss
|
|
339
|
-
accelerator="cpu",
|
|
340
|
-
)
|
|
341
|
-
|
|
342
|
-
optimizer_config = OptimizerConfig()
|
|
343
|
-
|
|
344
|
-
model_config = CategoryEmbeddingModelConfig(
|
|
345
|
-
task="classification" if is_classifier else "regression",
|
|
346
|
-
layers=model_params["layers"],
|
|
347
|
-
activation=model_params["activation"],
|
|
348
|
-
learning_rate=model_params["learning_rate"],
|
|
349
|
-
dropout=model_params["dropout"],
|
|
350
|
-
use_batch_norm=model_params["use_batch_norm"],
|
|
351
|
-
initialization=model_params["initialization"],
|
|
352
|
-
)
|
|
353
|
-
|
|
354
|
-
# Create and train fresh model
|
|
355
|
-
tabular_model = TabularModel(
|
|
356
|
-
data_config=data_config,
|
|
357
|
-
model_config=model_config,
|
|
358
|
-
optimizer_config=optimizer_config,
|
|
359
|
-
trainer_config=trainer_config,
|
|
360
|
-
)
|
|
361
|
-
|
|
362
|
-
# Change to /tmp for training (PyTorch Tabular needs write access)
|
|
363
|
-
original_cwd = os.getcwd()
|
|
364
|
-
try:
|
|
365
|
-
os.chdir("/tmp")
|
|
366
|
-
# Clean up checkpoint directory from previous fold
|
|
367
|
-
checkpoint_dir = "/tmp/saved_models"
|
|
368
|
-
if os.path.exists(checkpoint_dir):
|
|
369
|
-
shutil.rmtree(checkpoint_dir)
|
|
370
|
-
tabular_model.fit(train=df_train, validation=df_val)
|
|
371
|
-
finally:
|
|
372
|
-
os.chdir(original_cwd)
|
|
373
|
-
|
|
374
|
-
# Make predictions
|
|
375
|
-
result = tabular_model.predict(df_val[feature_cols])
|
|
376
|
-
|
|
377
|
-
# Extract predictions
|
|
378
|
-
prediction_col = f"{target_col}_prediction"
|
|
379
|
-
preds = result[prediction_col].values
|
|
380
|
-
|
|
381
|
-
# Store predictions at the correct indices
|
|
382
|
-
val_indices = df.iloc[val_idx].index
|
|
383
|
-
if is_classifier:
|
|
384
|
-
preds_decoded = label_encoder.inverse_transform(preds.astype(int))
|
|
385
|
-
predictions_df.loc[val_indices, "prediction"] = preds_decoded
|
|
386
|
-
|
|
387
|
-
# Get probabilities and store at validation indices only
|
|
388
|
-
prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
|
|
389
|
-
if prob_cols:
|
|
390
|
-
probs = result[prob_cols].values
|
|
391
|
-
for i, idx in enumerate(val_indices):
|
|
392
|
-
predictions_df.at[idx, "pred_proba"] = probs[i].tolist()
|
|
393
|
-
else:
|
|
394
|
-
predictions_df.loc[val_indices, "prediction"] = preds
|
|
395
|
-
|
|
396
|
-
# Calculate fold metrics
|
|
397
|
-
if is_classifier:
|
|
398
|
-
y_val_orig = label_encoder.inverse_transform(df_val[target_col])
|
|
399
|
-
preds_orig = preds_decoded
|
|
400
|
-
|
|
401
|
-
prec, rec, f1, _ = precision_recall_fscore_support(
|
|
402
|
-
y_val_orig, preds_orig, average="weighted", zero_division=0
|
|
403
|
-
)
|
|
404
|
-
|
|
405
|
-
prec_per_class, rec_per_class, f1_per_class, _ = precision_recall_fscore_support(
|
|
406
|
-
y_val_orig, preds_orig, average=None, zero_division=0, labels=label_encoder.classes_
|
|
407
|
-
)
|
|
408
|
-
|
|
409
|
-
y_val_encoded = df_val[target_col].values
|
|
410
|
-
roc_auc_overall = roc_auc_score(y_val_encoded, probs, multi_class="ovr", average="macro")
|
|
411
|
-
roc_auc_per_class = roc_auc_score(y_val_encoded, probs, multi_class="ovr", average=None)
|
|
412
|
-
|
|
413
|
-
fold_metrics.append(
|
|
414
|
-
{
|
|
415
|
-
"fold": fold_idx,
|
|
416
|
-
"precision": prec,
|
|
417
|
-
"recall": rec,
|
|
418
|
-
"f1": f1,
|
|
419
|
-
"roc_auc": roc_auc_overall,
|
|
420
|
-
"precision_per_class": prec_per_class,
|
|
421
|
-
"recall_per_class": rec_per_class,
|
|
422
|
-
"f1_per_class": f1_per_class,
|
|
423
|
-
"roc_auc_per_class": roc_auc_per_class,
|
|
424
|
-
}
|
|
425
|
-
)
|
|
426
|
-
|
|
427
|
-
print(f"Fold {fold_idx} - F1: {f1:.4f}, ROC-AUC: {roc_auc_overall:.4f}")
|
|
428
|
-
else:
|
|
429
|
-
y_val = df_val[target_col].values
|
|
430
|
-
spearman_corr, _ = spearmanr(y_val, preds)
|
|
431
|
-
rmse = np.sqrt(mean_squared_error(y_val, preds))
|
|
432
|
-
|
|
433
|
-
fold_metrics.append(
|
|
434
|
-
{
|
|
435
|
-
"fold": fold_idx,
|
|
436
|
-
"rmse": rmse,
|
|
437
|
-
"mae": mean_absolute_error(y_val, preds),
|
|
438
|
-
"medae": median_absolute_error(y_val, preds),
|
|
439
|
-
"r2": r2_score(y_val, preds),
|
|
440
|
-
"spearmanr": spearman_corr,
|
|
441
|
-
}
|
|
442
|
-
)
|
|
443
|
-
|
|
444
|
-
print(f"Fold {fold_idx} - RMSE: {rmse:.4f}, R2: {fold_metrics[-1]['r2']:.4f}")
|
|
445
|
-
|
|
446
|
-
# Calculate summary metrics
|
|
447
|
-
fold_df = pd.DataFrame(fold_metrics)
|
|
448
|
-
|
|
449
|
-
if is_classifier:
|
|
450
|
-
if "pred_proba" in predictions_df.columns:
|
|
451
|
-
predictions_df = expand_proba_column(predictions_df, label_encoder.classes_)
|
|
452
|
-
|
|
453
|
-
metric_rows = []
|
|
454
|
-
for idx, class_name in enumerate(label_encoder.classes_):
|
|
455
|
-
prec_scores = np.array([fold["precision_per_class"][idx] for fold in fold_metrics])
|
|
456
|
-
rec_scores = np.array([fold["recall_per_class"][idx] for fold in fold_metrics])
|
|
457
|
-
f1_scores = np.array([fold["f1_per_class"][idx] for fold in fold_metrics])
|
|
458
|
-
roc_auc_scores = np.array([fold["roc_auc_per_class"][idx] for fold in fold_metrics])
|
|
459
|
-
|
|
460
|
-
y_orig = label_encoder.inverse_transform(y_for_cv)
|
|
461
|
-
support = int((y_orig == class_name).sum())
|
|
462
|
-
|
|
463
|
-
metric_rows.append(
|
|
464
|
-
{
|
|
465
|
-
"class": class_name,
|
|
466
|
-
"precision": prec_scores.mean(),
|
|
467
|
-
"recall": rec_scores.mean(),
|
|
468
|
-
"f1": f1_scores.mean(),
|
|
469
|
-
"roc_auc": roc_auc_scores.mean(),
|
|
470
|
-
"support": support,
|
|
471
|
-
}
|
|
472
|
-
)
|
|
473
|
-
|
|
474
|
-
metric_rows.append(
|
|
475
|
-
{
|
|
476
|
-
"class": "all",
|
|
477
|
-
"precision": fold_df["precision"].mean(),
|
|
478
|
-
"recall": fold_df["recall"].mean(),
|
|
479
|
-
"f1": fold_df["f1"].mean(),
|
|
480
|
-
"roc_auc": fold_df["roc_auc"].mean(),
|
|
481
|
-
"support": len(y_for_cv),
|
|
482
|
-
}
|
|
483
|
-
)
|
|
484
|
-
|
|
485
|
-
metrics_df = pd.DataFrame(metric_rows)
|
|
486
|
-
else:
|
|
487
|
-
metrics_df = pd.DataFrame(
|
|
488
|
-
[
|
|
489
|
-
{
|
|
490
|
-
"rmse": fold_df["rmse"].mean(),
|
|
491
|
-
"mae": fold_df["mae"].mean(),
|
|
492
|
-
"medae": fold_df["medae"].mean(),
|
|
493
|
-
"r2": fold_df["r2"].mean(),
|
|
494
|
-
"spearmanr": fold_df["spearmanr"].mean(),
|
|
495
|
-
"support": len(y_for_cv),
|
|
496
|
-
}
|
|
497
|
-
]
|
|
498
|
-
)
|
|
499
|
-
|
|
500
|
-
print(f"\n{'='*50}")
|
|
501
|
-
print("Cross-Validation Summary")
|
|
502
|
-
print(f"{'='*50}")
|
|
503
|
-
print(metrics_df.to_string(index=False))
|
|
504
|
-
|
|
505
|
-
return metrics_df, predictions_df
|
|
506
|
-
|
|
507
|
-
finally:
|
|
508
|
-
log.info(f"Cleaning up model directory: {model_dir}")
|
|
509
|
-
shutil.rmtree(model_dir, ignore_errors=True)
|
|
510
|
-
|
|
511
|
-
|
|
512
74
|
if __name__ == "__main__":
|
|
75
|
+
from workbench.api import Model
|
|
513
76
|
|
|
514
|
-
#
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
# Initialize Workbench model
|
|
518
|
-
model_name = "caco2-er-reg-pytorch-test"
|
|
519
|
-
# model_name = "aqsol-pytorch-reg"
|
|
77
|
+
# Test pulling CV results
|
|
78
|
+
model_name = "aqsol-reg-pytorch"
|
|
520
79
|
print(f"Loading Workbench model: {model_name}")
|
|
521
80
|
model = Model(model_name)
|
|
522
81
|
print(f"Model Framework: {model.model_framework}")
|
|
523
82
|
|
|
524
|
-
#
|
|
525
|
-
|
|
526
|
-
|
|
83
|
+
# Pull CV results from training artifacts
|
|
84
|
+
metrics_df, predictions_df = pull_cv_results(model)
|
|
85
|
+
print(f"\nMetrics:\n{metrics_df}")
|
|
86
|
+
print(f"\nPredictions shape: {predictions_df.shape}")
|
|
87
|
+
print(f"Predictions columns: {predictions_df.columns.tolist()}")
|
workbench/utils/shap_utils.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import Optional, List, Tuple, Dict, Union
|
|
|
9
9
|
from workbench.utils.xgboost_model_utils import xgboost_model_from_s3
|
|
10
10
|
from workbench.utils.model_utils import load_category_mappings_from_s3
|
|
11
11
|
from workbench.utils.pandas_utils import convert_categorical_types
|
|
12
|
+
from workbench.model_script_utils.model_script_utils import decompress_features
|
|
12
13
|
|
|
13
14
|
# Set up the log
|
|
14
15
|
log = logging.getLogger("workbench")
|
|
@@ -111,61 +112,6 @@ def shap_values_data(
|
|
|
111
112
|
return result_df, feature_df
|
|
112
113
|
|
|
113
114
|
|
|
114
|
-
def decompress_features(
|
|
115
|
-
df: pd.DataFrame, features: List[str], compressed_features: List[str]
|
|
116
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
|
117
|
-
"""Prepare features for the XGBoost model
|
|
118
|
-
|
|
119
|
-
Args:
|
|
120
|
-
df (pd.DataFrame): The features DataFrame
|
|
121
|
-
features (List[str]): Full list of feature names
|
|
122
|
-
compressed_features (List[str]): List of feature names to decompress (bitstrings)
|
|
123
|
-
|
|
124
|
-
Returns:
|
|
125
|
-
pd.DataFrame: DataFrame with the decompressed features
|
|
126
|
-
List[str]: Updated list of feature names after decompression
|
|
127
|
-
|
|
128
|
-
Raises:
|
|
129
|
-
ValueError: If any missing values are found in the specified features
|
|
130
|
-
"""
|
|
131
|
-
|
|
132
|
-
# Check for any missing values in the required features
|
|
133
|
-
missing_counts = df[features].isna().sum()
|
|
134
|
-
if missing_counts.any():
|
|
135
|
-
missing_features = missing_counts[missing_counts > 0]
|
|
136
|
-
print(
|
|
137
|
-
f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
|
|
138
|
-
"WARNING: You might want to remove/replace all NaN values before processing."
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
# Decompress the specified compressed features
|
|
142
|
-
decompressed_features = features
|
|
143
|
-
for feature in compressed_features:
|
|
144
|
-
if (feature not in df.columns) or (feature not in features):
|
|
145
|
-
print(f"Feature '{feature}' not in the features list, skipping decompression.")
|
|
146
|
-
continue
|
|
147
|
-
|
|
148
|
-
# Remove the feature from the list of features to avoid duplication
|
|
149
|
-
decompressed_features.remove(feature)
|
|
150
|
-
|
|
151
|
-
# Handle all compressed features as bitstrings
|
|
152
|
-
bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
|
|
153
|
-
prefix = feature[:3]
|
|
154
|
-
|
|
155
|
-
# Create all new columns at once - avoids fragmentation
|
|
156
|
-
new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
|
|
157
|
-
new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
|
|
158
|
-
|
|
159
|
-
# Add to features list
|
|
160
|
-
decompressed_features.extend(new_col_names)
|
|
161
|
-
|
|
162
|
-
# Drop original column and concatenate new ones
|
|
163
|
-
df = df.drop(columns=[feature])
|
|
164
|
-
df = pd.concat([df, new_df], axis=1)
|
|
165
|
-
|
|
166
|
-
return df, decompressed_features
|
|
167
|
-
|
|
168
|
-
|
|
169
115
|
def _calculate_shap_values(workbench_model, sample_df: pd.DataFrame = None):
|
|
170
116
|
"""
|
|
171
117
|
Internal function to calculate SHAP values for Workbench Models.
|