workbench 0.8.201__py3-none-any.whl → 0.8.203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +41 -7
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +203 -58
- workbench/core/artifacts/model_core.py +11 -7
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +27 -13
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/model_scripts/chemprop/chemprop.template +297 -295
- workbench/model_scripts/chemprop/generated_model_script.py +300 -298
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +278 -128
- workbench/model_scripts/pytorch_model/pytorch.template +273 -123
- workbench/model_scripts/uq_models/generated_model_script.py +19 -10
- workbench/model_scripts/uq_models/mapie.template +17 -8
- workbench/model_scripts/xgb_model/generated_model_script.py +38 -9
- workbench/model_scripts/xgb_model/xgb_model.template +34 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/utils/chemprop_utils.py +38 -1
- workbench/utils/pytorch_utils.py +38 -8
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/METADATA +2 -2
- {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/RECORD +32 -32
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
- {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.201.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
@@ -39,11 +39,13 @@ from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import (
     mean_absolute_error,
+    median_absolute_error,
     r2_score,
     root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
+from scipy.stats import spearmanr
 import joblib
 
 # ChemProp imports
@@ -51,12 +53,12 @@ from chemprop import data, models, nn
 
 # Template Parameters
 TEMPLATE_PARAMS = {
-    "model_type": "
-    "
+    "model_type": "regressor",
+    "targets": ['logd', 'ksol', 'hlm_clint', 'mlm_clint', 'caco_2_papp_a_b', 'caco_2_efflux', 'mppb', 'mbpb', 'mgmb'],  # List of target columns (single or multi-task)
     "feature_list": ['smiles'],
-    "
-    "
-    "hyperparameters": {
+    "id_column": "molecule_name",
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/open-admet-chemprop-mt/training",
+    "hyperparameters": {},
 }
 
 
@@ -108,14 +110,14 @@ def expand_proba_column(df: pd.DataFrame, class_labels: list[str]) -> pd.DataFra
 
 def create_molecule_datapoints(
     smiles_list: list[str],
-    targets: list[float] | None = None,
+    targets: list[float] | np.ndarray | None = None,
     extra_descriptors: np.ndarray | None = None,
 ) -> tuple[list[data.MoleculeDatapoint], list[int]]:
     """Create ChemProp MoleculeDatapoints from SMILES strings.
 
     Args:
         smiles_list: List of SMILES strings
-        targets: Optional
+        targets: Optional target values as 2D array (n_samples, n_targets). NaN allowed for missing targets.
         extra_descriptors: Optional array of extra features (n_samples, n_features)
 
     Returns:
@@ -127,6 +129,12 @@ def create_molecule_datapoints(
     valid_indices = []
     invalid_count = 0
 
+    # Convert targets to 2D array if provided
+    if targets is not None:
+        targets = np.atleast_2d(np.array(targets))
+        if targets.shape[0] == 1 and len(smiles_list) > 1:
+            targets = targets.T  # Shape was (1, n_samples), transpose to (n_samples, 1)
+
     for i, smi in enumerate(smiles_list):
         # Validate SMILES with RDKit first
         mol = Chem.MolFromSmiles(smi)
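Aside from the editor (not part of the diff): the reworked `create_molecule_datapoints` accepts either a flat list of single-task targets or a 2D multi-task array and normalizes both to shape `(n_samples, n_targets)`. A minimal numpy sketch of that shape handling, using made-up values:

```python
import numpy as np

smiles_list = ["CCO", "c1ccccc1", "CCN"]

# Single-task: a flat list of target values, one per molecule
targets = np.atleast_2d(np.array([1.2, 3.4, 5.6]))       # shape (1, 3)
if targets.shape[0] == 1 and len(smiles_list) > 1:
    targets = targets.T                                   # -> (3, 1): one column per target

# Multi-task: already (n_samples, n_targets); NaN marks a missing label
multi = np.atleast_2d(np.array([[1.2, np.nan], [3.4, 0.5], [5.6, 0.7]]))  # shape (3, 2)

print(targets.shape, multi.shape)  # (3, 1) (3, 2)
```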
@@ -134,8 +142,9 @@ def create_molecule_datapoints(
             invalid_count += 1
             continue
 
-        # Build datapoint with optional target and extra descriptors
-        y
+        # Build datapoint with optional target(s) and extra descriptors
+        # For multi-task, y is a list of values (can include NaN for missing targets)
+        y = targets[i].tolist() if targets is not None else None
         x_d = extra_descriptors[i] if extra_descriptors is not None else None
 
         dp = data.MoleculeDatapoint.from_smi(smi, y=y, x_d=x_d)
@@ -152,9 +161,11 @@ def build_mpnn_model(
     hyperparameters: dict,
     task: str = "regression",
     num_classes: int | None = None,
+    n_targets: int = 1,
     n_extra_descriptors: int = 0,
     x_d_transform: nn.ScaleTransform | None = None,
     output_transform: nn.UnscaleTransform | None = None,
+    task_weights: np.ndarray | None = None,
 ) -> models.MPNN:
     """Build an MPNN model with the specified hyperparameters.
 
@@ -162,19 +173,21 @@
         hyperparameters: Dictionary of model hyperparameters
         task: Either "regression" or "classification"
         num_classes: Number of classes for classification tasks
+        n_targets: Number of target columns (for multi-task regression)
         n_extra_descriptors: Number of extra descriptor features (for hybrid mode)
         x_d_transform: Optional transform for extra descriptors (scaling)
         output_transform: Optional transform for regression output (unscaling targets)
+        task_weights: Optional array of weights for each task (multi-task learning)
 
     Returns:
         Configured MPNN model
     """
     # Model hyperparameters with defaults
-    hidden_dim = hyperparameters.get("hidden_dim",
-    depth = hyperparameters.get("depth",
-    dropout = hyperparameters.get("dropout", 0.
-    ffn_hidden_dim = hyperparameters.get("ffn_hidden_dim",
-    ffn_num_layers = hyperparameters.get("ffn_num_layers",
+    hidden_dim = hyperparameters.get("hidden_dim", 700)
+    depth = hyperparameters.get("depth", 6)
+    dropout = hyperparameters.get("dropout", 0.25)
+    ffn_hidden_dim = hyperparameters.get("ffn_hidden_dim", 2000)
+    ffn_num_layers = hyperparameters.get("ffn_num_layers", 2)
 
     # Message passing component
     mp = nn.BondMessagePassing(d_h=hidden_dim, depth=depth, dropout=dropout)
@@ -197,12 +210,20 @@ def build_mpnn_model(
         )
     else:
         # Regression with optional output transform to unscale predictions
+        # n_tasks controls the number of output heads for multi-task learning
+        # task_weights goes here (in RegressionFFN) to weight loss per task
+        weights_tensor = None
+        if task_weights is not None:
+            weights_tensor = torch.tensor(task_weights, dtype=torch.float32)
+
         ffn = nn.RegressionFFN(
             input_dim=ffn_input_dim,
             hidden_dim=ffn_hidden_dim,
             n_layers=ffn_num_layers,
             dropout=dropout,
+            n_tasks=n_targets,
             output_transform=output_transform,
+            task_weights=weights_tensor,
         )
 
     # Create the MPNN model
@@ -227,31 +248,26 @@ def model_fn(model_dir: str) -> dict:
     Returns:
         Dictionary with ensemble models and metadata
     """
-    # Load ensemble metadata
+    # Load ensemble metadata (required)
     ensemble_metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
-
-
-
-    else:
-        # Backwards compatibility: single model without ensemble metadata
-        n_ensemble = 1
+    ensemble_metadata = joblib.load(ensemble_metadata_path)
+    n_ensemble = ensemble_metadata["n_ensemble"]
+    target_columns = ensemble_metadata["target_columns"]
 
     # Load all ensemble models
     ensemble_models = []
     for ens_idx in range(n_ensemble):
         model_path = os.path.join(model_dir, f"chemprop_model_{ens_idx}.pt")
-        if not os.path.exists(model_path):
-            # Backwards compatibility: try old single model path
-            model_path = os.path.join(model_dir, "chemprop_model.pt")
         model = models.MPNN.load_from_file(model_path)
         model.eval()
         ensemble_models.append(model)
 
-    print(f"Loaded {len(ensemble_models)} ensemble model(s)")
+    print(f"Loaded {len(ensemble_models)} ensemble model(s), n_targets={len(target_columns)}")
 
     return {
         "ensemble_models": ensemble_models,
         "n_ensemble": n_ensemble,
+        "target_columns": target_columns,
     }
 
 
@@ -297,9 +313,10 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
     model_type = TEMPLATE_PARAMS["model_type"]
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
 
-    # Extract ensemble models
+    # Extract ensemble models and metadata
    ensemble_models = model_dict["ensemble_models"]
     n_ensemble = model_dict["n_ensemble"]
+    target_columns = model_dict["target_columns"]
 
     # Load label encoder if present (classification)
     label_encoder = None
@@ -337,13 +354,14 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
     valid_mask = np.array(valid_mask)
     print(f"Valid SMILES: {sum(valid_mask)} / {len(smiles_list)}")
 
-    # Initialize prediction
+    # Initialize prediction columns (use object dtype for classifiers to avoid FutureWarning)
     if model_type == "classifier":
         df["prediction"] = pd.Series([None] * len(df), dtype=object)
     else:
-
-
-        df["
+        # Regression: create prediction column for each target
+        for tc in target_columns:
+            df[f"{tc}_pred"] = np.nan
+            df[f"{tc}_pred_std"] = np.nan
 
     if sum(valid_mask) == 0:
         print("Warning: No valid SMILES to predict on")
@@ -408,10 +426,15 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
             ens_preds = ens_preds.squeeze(axis=1)
         all_ensemble_preds.append(ens_preds)
 
-    # Stack and compute mean/std
+    # Stack and compute mean/std (std is 0 for single model)
     ensemble_preds = np.stack(all_ensemble_preds, axis=0)
     preds = np.mean(ensemble_preds, axis=0)
-    preds_std = np.std(ensemble_preds, axis=0)
+    preds_std = np.std(ensemble_preds, axis=0)  # Will be 0s for n_ensemble=1
+
+    # Ensure 2D: (n_samples, n_targets)
+    if preds.ndim == 1:
+        preds = preds.reshape(-1, 1)
+        preds_std = preds_std.reshape(-1, 1)
 
     print(f"Inference: Ensemble predictions shape: {preds.shape}")
 
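Aside from the editor (not part of the diff): at inference time the new code averages the per-model predictions and reports their spread as an uncertainty estimate. A minimal numpy sketch of that reduction, with fake predictions from three ensemble members over two targets:

```python
import numpy as np

# Fake predictions from 3 ensemble members for 4 molecules and 2 targets
member_preds = [np.random.rand(4, 2) for _ in range(3)]

stacked = np.stack(member_preds, axis=0)   # (n_ensemble, n_samples, n_targets)
mean_pred = np.mean(stacked, axis=0)       # (n_samples, n_targets) -> the reported prediction
std_pred = np.std(stacked, axis=0)         # per-target ensemble disagreement; all zeros if n_ensemble == 1

print(mean_pred.shape, std_pred.shape)     # (4, 2) (4, 2)
```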
@@ -440,12 +463,10 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
         decoded_preds = label_encoder.inverse_transform(class_preds)
         df.loc[valid_mask, "prediction"] = decoded_preds
     else:
-        # Regression:
-
-
-
-        if preds_std is not None:
-            df.loc[valid_mask, "prediction_std"] = preds_std.flatten()
+        # Regression: store predictions for each target
+        for t_idx, tc in enumerate(target_columns):
+            df.loc[valid_mask, f"{tc}_pred"] = preds[:, t_idx]
+            df.loc[valid_mask, f"{tc}_pred_std"] = preds_std[:, t_idx]
 
     return df
 
@@ -454,13 +475,18 @@ if __name__ == "__main__":
     """Training script for ChemProp MPNN model"""
 
     # Template Parameters
-
+    target_columns = TEMPLATE_PARAMS["targets"]  # List of target columns
     model_type = TEMPLATE_PARAMS["model_type"]
     feature_list = TEMPLATE_PARAMS["feature_list"]
+    id_column = TEMPLATE_PARAMS["id_column"]
     model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
-    train_all_data = TEMPLATE_PARAMS["train_all_data"]
     hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
-
+
+    # Validate target_columns
+    if not target_columns or not isinstance(target_columns, list) or len(target_columns) == 0:
+        raise ValueError("'targets' must be a non-empty list of target column names")
+    n_targets = len(target_columns)
+    print(f"Target columns ({n_targets}): {target_columns}")
 
     # Get the SMILES column name from feature_list (user defines this, so we use their exact name)
     smiles_column = find_smiles_column(feature_list)
@@ -502,21 +528,29 @@ if __name__ == "__main__":
 
     check_dataframe(all_df, "training_df")
 
-    # Drop rows with missing SMILES or target values
+    # Drop rows with missing SMILES or all target values
     initial_count = len(all_df)
-    all_df = all_df.dropna(subset=[smiles_column
+    all_df = all_df.dropna(subset=[smiles_column])
+    # Keep rows that have at least one non-null target (works for single and multi-task)
+    has_any_target = all_df[target_columns].notna().any(axis=1)
+    all_df = all_df[has_any_target]
     dropped = initial_count - len(all_df)
     if dropped > 0:
-        print(f"Dropped {dropped} rows with missing SMILES or target values")
+        print(f"Dropped {dropped} rows with missing SMILES or all target values")
 
-    print(f"Target: {
+    print(f"Target columns: {target_columns}")
     print(f"Data Shape after cleaning: {all_df.shape}")
+    for tc in target_columns:
+        n_valid = all_df[tc].notna().sum()
+        print(f"  {tc}: {n_valid} samples with values")
 
-    # Set up label encoder for classification
+    # Set up label encoder for classification (single-target only)
     label_encoder = None
     if model_type == "classifier":
+        if n_targets > 1:
+            raise ValueError("Multi-task classification is not supported. Use regression for multi-task.")
         label_encoder = LabelEncoder()
-        all_df[
+        all_df[target_columns[0]] = label_encoder.fit_transform(all_df[target_columns[0]])
         num_classes = len(label_encoder.classes_)
         print(
             f"Classification task with {num_classes} classes: {label_encoder.classes_}"
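Aside from the editor (not part of the diff): for multi-task data the cleaning step now keeps any row that has at least one labeled target instead of requiring all of them. A small pandas sketch of that filter with toy data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "smiles": ["CCO", "c1ccccc1", "CCN"],
    "logd": [1.2, np.nan, np.nan],
    "ksol": [np.nan, np.nan, 0.4],
})
target_columns = ["logd", "ksol"]

df = df.dropna(subset=["smiles"])
has_any_target = df[target_columns].notna().any(axis=1)
df = df[has_any_target]          # the middle row (all targets NaN) is dropped
print(df)
```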
@@ -528,10 +562,10 @@ if __name__ == "__main__":
     print(f"Hyperparameters: {hyperparameters}")
     task = "classification" if model_type == "classifier" else "regression"
     n_extra = len(extra_feature_cols) if use_extra_features else 0
-    max_epochs = hyperparameters.get("max_epochs",
-    patience = hyperparameters.get("patience",
-    n_folds = hyperparameters.get("n_folds",
-    batch_size = hyperparameters.get("batch_size",
+    max_epochs = hyperparameters.get("max_epochs", 400)
+    patience = hyperparameters.get("patience", 40)
+    n_folds = hyperparameters.get("n_folds", 5)  # Number of CV folds (default: 5)
+    batch_size = hyperparameters.get("batch_size", 16)
 
     # Check extra feature columns exist
     if use_extra_features:
@@ -540,60 +574,108 @@ if __name__ == "__main__":
             raise ValueError(f"Missing extra feature columns in training data: {missing_cols}")
 
     # =========================================================================
-    #
+    # UNIFIED TRAINING: Works for n_folds=1 (single model) or n_folds>1 (K-fold CV)
     # =========================================================================
+    print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold cross-validation ensemble'}...")
+
+    # Prepare extra features and validate SMILES upfront
+    all_extra_features = None
+    col_means = None
+    if use_extra_features:
+        all_extra_features = all_df[extra_feature_cols].values.astype(np.float32)
+        col_means = np.nanmean(all_extra_features, axis=0)
+        for i in range(all_extra_features.shape[1]):
+            all_extra_features[np.isnan(all_extra_features[:, i]), i] = col_means[i]
+
+    # Prepare target array: always 2D (n_samples, n_targets)
+    all_targets = all_df[target_columns].values.astype(np.float32)
+
+    # Filter invalid SMILES from the full dataset
+    _, valid_indices = create_molecule_datapoints(
+        all_df[smiles_column].tolist(), all_targets, all_extra_features
+    )
+    all_df = all_df.iloc[valid_indices].reset_index(drop=True)
+    all_targets = all_targets[valid_indices]
+    if all_extra_features is not None:
+        all_extra_features = all_extra_features[valid_indices]
+    print(f"Data after SMILES validation: {all_df.shape}")
+
+    # Compute dynamic task weights for multi-task regression
+    # Weight = inverse of sample count (normalized so min weight = 1.0)
+    # This gives higher weight to targets with fewer samples
+    task_weights = None
+    if n_targets > 1 and model_type != "classifier":
+        sample_counts = np.array([np.sum(~np.isnan(all_targets[:, t])) for t in range(n_targets)])
+        # Inverse weighting: fewer samples = higher weight
+        inverse_counts = 1.0 / sample_counts
+        # Normalize so minimum weight is 1.0
+        task_weights = inverse_counts / inverse_counts.min()
+        print(f"Task weights (inverse sample count):")
+        for t_idx, t_name in enumerate(target_columns):
+            print(f"  {t_name}: {task_weights[t_idx]:.3f} (n={sample_counts[t_idx]})")
+
+    # Create fold splits
     if n_folds == 1:
-
-
-        # Split data
-        if train_all_data:
-            print("Training on ALL of the data")
-            df_train = all_df.copy()
-            df_val = all_df.copy()
-        elif "training" in all_df.columns:
+        # Single fold: use train/val split from "training" column or random split
+        if "training" in all_df.columns:
             print("Found training column, splitting data based on training column")
-
-
+            train_idx = np.where(all_df["training"])[0]
+            val_idx = np.where(~all_df["training"])[0]
         else:
-            print("WARNING: No training column found, splitting data with random
-
-
-
+            print("WARNING: No training column found, splitting data with random 80/20 split")
+            indices = np.arange(len(all_df))
+            train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
+        folds = [(train_idx, val_idx)]
+    else:
+        # K-Fold CV
+        if model_type == "classifier":
+            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = all_df[target_columns[0]]
+        else:
+            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
+            split_target = None
+        folds = list(kfold.split(all_df, split_target))
+
+    # Initialize storage for out-of-fold predictions: always 2D (n_samples, n_targets)
+    oof_predictions = np.full((len(all_df), n_targets), np.nan, dtype=np.float64)
+    if model_type == "classifier" and num_classes and num_classes > 1:
+        oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+    else:
+        oof_proba = None
 
-
-        print(f"VALIDATION: {df_val.shape}")
+    ensemble_models = []
 
-
-
-
-
+    for fold_idx, (train_idx, val_idx) in enumerate(folds):
+        print(f"\n{'='*50}")
+        print(f"Training Fold {fold_idx + 1}/{len(folds)}")
+        print(f"{'='*50}")
 
-
-
-
-
-
-
-
-
-
-
-
+        # Split data for this fold
+        df_train = all_df.iloc[train_idx].reset_index(drop=True)
+        df_val = all_df.iloc[val_idx].reset_index(drop=True)
+        train_targets = all_targets[train_idx]
+        val_targets = all_targets[val_idx]
+
+        train_extra = all_extra_features[train_idx] if all_extra_features is not None else None
+        val_extra = all_extra_features[val_idx] if all_extra_features is not None else None
+
+        print(f"Fold {fold_idx + 1} - Train: {len(df_train)}, Val: {len(df_val)}")
+
+        # Create ChemProp datasets for this fold
+        train_datapoints, _ = create_molecule_datapoints(
+            df_train[smiles_column].tolist(), train_targets, train_extra
         )
-        val_datapoints,
-            df_val[smiles_column].tolist(),
+        val_datapoints, _ = create_molecule_datapoints(
+            df_val[smiles_column].tolist(), val_targets, val_extra
         )
 
-        df_train = df_train.iloc[train_valid_idx].reset_index(drop=True)
-        df_val = df_val.iloc[val_valid_idx].reset_index(drop=True)
-
         train_dataset = data.MoleculeDataset(train_datapoints)
         val_dataset = data.MoleculeDataset(val_datapoints)
 
-        # Save raw
-        val_extra_raw =
+        # Save raw val features for prediction
+        val_extra_raw = val_extra.copy() if val_extra is not None else None
 
-        # Scale features and targets
+        # Scale features and targets for this fold
         x_d_transform = None
         if use_extra_features:
             feature_scaler = train_dataset.normalize_inputs("X_d")
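Aside from the editor (not part of the diff): the dynamic task weights introduced above are just inverse per-target sample counts, rescaled so the best-covered target gets weight 1.0. A worked numpy example with hypothetical counts:

```python
import numpy as np

# Hypothetical non-NaN sample counts for three target columns
sample_counts = np.array([9000, 3000, 1500])

inverse_counts = 1.0 / sample_counts
task_weights = inverse_counts / inverse_counts.min()  # min weight normalized to 1.0

print(task_weights)  # [1. 3. 6.] -> the rarest target contributes most to the loss
```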
@@ -601,7 +683,7 @@ if __name__ == "__main__":
             x_d_transform = nn.ScaleTransform.from_standard_scaler(feature_scaler)
 
         output_transform = None
-        if model_type
+        if model_type in ["regressor", "uq_regressor"]:
             target_scaler = train_dataset.normalize_targets()
             val_dataset.normalize_targets(target_scaler)
             output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
@@ -609,17 +691,18 @@ if __name__ == "__main__":
         train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
         val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
 
-        # Build and train
-        pl.seed_everything(42)
+        # Build and train model for this fold
+        pl.seed_everything(42 + fold_idx)
         mpnn = build_mpnn_model(
-            hyperparameters, task=task, num_classes=num_classes,
+            hyperparameters, task=task, num_classes=num_classes, n_targets=n_targets,
             n_extra_descriptors=n_extra, x_d_transform=x_d_transform, output_transform=output_transform,
+            task_weights=task_weights,
         )
 
         callbacks = [
             pl.callbacks.EarlyStopping(monitor="val_loss", patience=patience, mode="min"),
             pl.callbacks.ModelCheckpoint(
-                dirpath=args.model_dir, filename="
+                dirpath=args.model_dir, filename=f"best_model_{fold_idx}",
                 monitor="val_loss", mode="min", save_top_k=1,
             ),
         ]
@@ -636,201 +719,95 @@ if __name__ == "__main__":
             mpnn.load_state_dict(checkpoint["state_dict"])
 
         mpnn.eval()
-        ensemble_models
+        ensemble_models.append(mpnn)
 
-        # Make predictions
+        # Make out-of-fold predictions using raw features
         val_datapoints_raw, _ = create_molecule_datapoints(
-            df_val[smiles_column].tolist(),
+            df_val[smiles_column].tolist(), val_targets, val_extra_raw
         )
         val_dataset_raw = data.MoleculeDataset(val_datapoints_raw)
         val_loader_pred = data.build_dataloader(val_dataset_raw, batch_size=batch_size, shuffle=False)
 
         with torch.inference_mode():
-
-
-            if
-
-
-
-
-
-
-
-
-    else:
-        print(f"Training {n_folds}-fold cross-validation ensemble...")
-
-        # Validate all SMILES upfront and filter invalid ones
-        all_extra_features = None
-        if use_extra_features:
-            all_extra_features = all_df[extra_feature_cols].values.astype(np.float32)
-            col_means = np.nanmean(all_extra_features, axis=0)
-            for i in range(all_extra_features.shape[1]):
-                all_extra_features[np.isnan(all_extra_features[:, i]), i] = col_means[i]
-        else:
-            col_means = None
-
-        # Filter invalid SMILES from the full dataset
-        _, valid_indices = create_molecule_datapoints(
-            all_df[smiles_column].tolist(), all_df[target].tolist(), all_extra_features
-        )
-        all_df = all_df.iloc[valid_indices].reset_index(drop=True)
-        if all_extra_features is not None:
-            all_extra_features = all_extra_features[valid_indices]
-        print(f"Data after SMILES validation: {all_df.shape}")
-
-        # Set up K-Fold
-        if model_type == "classifier":
-            kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = all_df[target]
-        else:
-            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
-            split_target = None
-
-        # Initialize storage for out-of-fold predictions
-        oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
-        if model_type == "classifier" and num_classes and num_classes > 1:
-            oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
+            fold_predictions = trainer.predict(mpnn, val_loader_pred)
+            fold_preds = np.concatenate([p.numpy() for p in fold_predictions], axis=0)
+            if fold_preds.ndim == 3 and fold_preds.shape[1] == 1:
+                fold_preds = fold_preds.squeeze(axis=1)
+
+        # Store out-of-fold predictions
+        if model_type == "classifier" and fold_preds.ndim == 2:
+            # Store class index in first column for classification
+            oof_predictions[val_idx, 0] = np.argmax(fold_preds, axis=1)
+            if oof_proba is not None:
+                oof_proba[val_idx] = fold_preds
         else:
-
+            # Regression: fold_preds shape is (n_val, n_targets) or (n_val,)
+            if fold_preds.ndim == 1:
+                fold_preds = fold_preds.reshape(-1, 1)
+            oof_predictions[val_idx] = fold_preds
 
-
+        print(f"Fold {fold_idx + 1} complete!")
 
-
-            print(f"\n{'='*50}")
-            print(f"Training Fold {fold_idx + 1}/{n_folds}")
-            print(f"{'='*50}")
+    print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            output_transform = None
-            if model_type == "regressor":
-                target_scaler = train_dataset.normalize_targets()
-                val_dataset.normalize_targets(target_scaler)
-                output_transform = nn.UnscaleTransform.from_standard_scaler(target_scaler)
-
-            train_loader = data.build_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
-            val_loader = data.build_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
-
-            # Build and train model for this fold
-            pl.seed_everything(42 + fold_idx)
-            mpnn = build_mpnn_model(
-                hyperparameters, task=task, num_classes=num_classes,
-                n_extra_descriptors=n_extra, x_d_transform=x_d_transform, output_transform=output_transform,
-            )
-
-            callbacks = [
-                pl.callbacks.EarlyStopping(monitor="val_loss", patience=patience, mode="min"),
-                pl.callbacks.ModelCheckpoint(
-                    dirpath=args.model_dir, filename=f"best_model_{fold_idx}",
-                    monitor="val_loss", mode="min", save_top_k=1,
-                ),
-            ]
-
-            trainer = pl.Trainer(
-                accelerator="auto", max_epochs=max_epochs, callbacks=callbacks,
-                logger=False, enable_progress_bar=True,
-            )
-
-            trainer.fit(mpnn, train_loader, val_loader)
-
-            if trainer.checkpoint_callback and trainer.checkpoint_callback.best_model_path:
-                checkpoint = torch.load(trainer.checkpoint_callback.best_model_path, weights_only=False)
-                mpnn.load_state_dict(checkpoint["state_dict"])
-
-            mpnn.eval()
-            ensemble_models.append(mpnn)
-
-            # Make out-of-fold predictions using raw features
-            val_datapoints_raw, _ = create_molecule_datapoints(
-                df_val[smiles_column].tolist(), df_val[target].tolist(), val_extra_raw
-            )
-            val_dataset_raw = data.MoleculeDataset(val_datapoints_raw)
-            val_loader_pred = data.build_dataloader(val_dataset_raw, batch_size=batch_size, shuffle=False)
+    # Use out-of-fold predictions for metrics
+    # For n_folds=1, we only have predictions for val_idx, so filter to those rows
+    if n_folds == 1:
+        # oof_predictions is always 2D now: check if any column has a value
+        val_mask = ~np.isnan(oof_predictions).all(axis=1)
+        preds = oof_predictions[val_mask]
+        df_val = all_df[val_mask].copy()
+        y_validate = all_targets[val_mask]
+        if oof_proba is not None:
+            oof_proba = oof_proba[val_mask]
+        val_extra_features = all_extra_features[val_mask] if all_extra_features is not None else None
+    else:
+        preds = oof_predictions
+        df_val = all_df.copy()
+        y_validate = all_targets
+        val_extra_features = all_extra_features
+
+    # Compute prediction_std by running all ensemble models on validation data
+    # For n_folds=1, std will be 0 (only one model). For n_folds>1, std shows ensemble disagreement.
+    preds_std = None
+    if model_type in ["regressor", "uq_regressor"] and len(ensemble_models) > 0:
+        print("Computing prediction_std from ensemble predictions on validation data...")
+        val_datapoints_for_std, _ = create_molecule_datapoints(
+            df_val[smiles_column].tolist(),
+            y_validate,
+            val_extra_features
+        )
+        val_dataset_for_std = data.MoleculeDataset(val_datapoints_for_std)
+        val_loader_for_std = data.build_dataloader(val_dataset_for_std, batch_size=batch_size, shuffle=False)
 
+        all_ensemble_preds_for_std = []
+        trainer_pred = pl.Trainer(accelerator="auto", logger=False, enable_progress_bar=False)
+        for ens_model in ensemble_models:
             with torch.inference_mode():
-
-
-                if
-
-
-
-
-
-
-
-
-
-
-            print(f"Fold {fold_idx + 1} complete!")
-
-    print(f"\nCross-validation complete! Trained {len(ensemble_models)} models.")
-
-    # Use out-of-fold predictions for metrics
-    preds = oof_predictions
-    preds_std = None  # Will compute from ensemble at inference time
-    y_validate = all_df[target].values
-    df_val = all_df  # For saving predictions
+                ens_preds = trainer_pred.predict(ens_model, val_loader_for_std)
+                ens_preds = np.concatenate([p.numpy() for p in ens_preds], axis=0)
+                if ens_preds.ndim == 3 and ens_preds.shape[1] == 1:
+                    ens_preds = ens_preds.squeeze(axis=1)
+                all_ensemble_preds_for_std.append(ens_preds)
+
+        # Stack ensemble predictions: shape (n_ensemble, n_samples, n_targets)
+        ensemble_preds_stacked = np.stack(all_ensemble_preds_for_std, axis=0)
+        preds_std = np.std(ensemble_preds_stacked, axis=0)
+        # Ensure 2D
+        if preds_std.ndim == 1:
+            preds_std = preds_std.reshape(-1, 1)
+        print(f"Ensemble prediction_std - mean per target: {np.nanmean(preds_std, axis=0)}")
 
     if model_type == "classifier":
-        # Classification metrics -
-
-
-            # Multi-class probabilities: (n_samples, n_classes), take argmax
-            class_preds = np.argmax(preds, axis=1)
-            has_proba = True
-        elif preds.ndim == 1:
-            # Either class indices (CV mode) or binary probabilities
-            if n_folds > 1:
-                # CV mode: preds are already class indices
-                class_preds = preds.astype(int)
-                has_proba = False
-            else:
-                # Single model: preds are probabilities
-                class_preds = (preds > 0.5).astype(int)
-                has_proba = False
-        else:
-            # Squeeze extra dimensions if needed
-            preds = preds.squeeze()
-            if preds.ndim == 2:
-                class_preds = np.argmax(preds, axis=1)
-                has_proba = True
-            else:
-                class_preds = (preds > 0.5).astype(int)
-                has_proba = False
+        # Classification metrics - preds contains class indices in first column from OOF predictions
+        class_preds = preds[:, 0].astype(int)
+        has_proba = oof_proba is not None
 
         print(f"class_preds shape: {class_preds.shape}")
 
-        # Decode labels for metrics
-
+        # Decode labels for metrics (classification is single-target only)
+        target_name = target_columns[0]
+        y_validate_decoded = label_encoder.inverse_transform(y_validate[:, 0].astype(int))
         preds_decoded = label_encoder.inverse_transform(class_preds)
 
         # Calculate metrics
@@ -841,7 +818,7 @@ if __name__ == "__main__":
 
         score_df = pd.DataFrame(
             {
-
+                target_name: label_names,
                 "precision": scores[0],
                 "recall": scores[1],
                 "f1": scores[2],
@@ -853,7 +830,7 @@ if __name__ == "__main__":
         metrics = ["precision", "recall", "f1", "support"]
         for t in label_names:
             for m in metrics:
-                value = score_df.loc[score_df[
+                value = score_df.loc[score_df[target_name] == t, m].iloc[0]
                 print(f"Metrics:{t}:{m} {value}")
 
         # Confusion matrix
@@ -868,34 +845,55 @@ if __name__ == "__main__":
         # Save validation predictions
         df_val = df_val.copy()
         df_val["prediction"] = preds_decoded
-        if has_proba and
-            df_val["pred_proba"] = [p.tolist() for p in
+        if has_proba and oof_proba is not None:
+            df_val["pred_proba"] = [p.tolist() for p in oof_proba]
             df_val = expand_proba_column(df_val, label_names)
 
     else:
-        # Regression metrics
-        preds_flat = preds.flatten()
-        rmse = root_mean_squared_error(y_validate, preds_flat)
-        mae = mean_absolute_error(y_validate, preds_flat)
-        r2 = r2_score(y_validate, preds_flat)
-        print(f"RMSE: {rmse:.3f}")
-        print(f"MAE: {mae:.3f}")
-        print(f"R2: {r2:.3f}")
-        print(f"NumRows: {len(df_val)}")
-
+        # Regression metrics: compute per target (works for single or multi-task)
         df_val = df_val.copy()
-
-
-
-
-
-
+        print("\n--- Per-target metrics ---")
+        for t_idx, t_name in enumerate(target_columns):
+            # Get valid (non-NaN) indices for this target
+            target_valid_mask = ~np.isnan(y_validate[:, t_idx])
+            y_true = y_validate[target_valid_mask, t_idx]
+            y_pred = preds[target_valid_mask, t_idx]
+
+            if len(y_true) > 0:
+                rmse = root_mean_squared_error(y_true, y_pred)
+                mae = mean_absolute_error(y_true, y_pred)
+                medae = median_absolute_error(y_true, y_pred)
+                r2 = r2_score(y_true, y_pred)
+                spearman_corr = spearmanr(y_true, y_pred).correlation
+                support = len(y_true)
+                # Print metrics in format expected by SageMaker metric definitions
+                print(f"rmse: {rmse:.3f}")
+                print(f"mae: {mae:.3f}")
+                print(f"medae: {medae:.3f}")
+                print(f"r2: {r2:.3f}")
+                print(f"spearmanr: {spearman_corr:.3f}")
+                print(f"support: {support}")
+
+            # Store predictions in dataframe
+            df_val[f"{t_name}_pred"] = preds[:, t_idx]
+            if preds_std is not None:
+                df_val[f"{t_name}_pred_std"] = preds_std[:, t_idx]
+            else:
+                df_val[f"{t_name}_pred_std"] = 0.0
 
     # Save validation predictions to S3
-
-
-
+    # Include id_column if it exists in df_val
+    output_columns = []
+    if id_column in df_val.columns:
+        output_columns.append(id_column)
+    # Include all target columns and their predictions
+    output_columns += target_columns
+    output_columns += [f"{t}_pred" for t in target_columns]
+    output_columns += [f"{t}_pred_std" for t in target_columns]
+    # Add proba columns for classifiers
     output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+    # Filter to only columns that exist
+    output_columns = [c for c in output_columns if c in df_val.columns]
     wr.s3.to_csv(
         df_val[output_columns],
         path=f"{model_metrics_s3_path}/validation_predictions.csv",
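Aside from the editor (not part of the diff): with sparse multi-task labels, the validation metrics above are computed per target, using only the rows where that target is present. A small sketch of the NaN masking with illustrative values:

```python
import numpy as np
from sklearn.metrics import mean_absolute_error

y_true = np.array([[1.0, np.nan], [2.0, 0.5], [3.0, 0.7]])   # (n_samples, n_targets), NaN = missing label
y_pred = np.array([[1.1, 0.4], [1.9, 0.6], [3.2, 0.8]])

for t in range(y_true.shape[1]):
    mask = ~np.isnan(y_true[:, t])                            # rows labeled for this target
    if mask.sum() > 0:
        mae = mean_absolute_error(y_true[mask, t], y_pred[mask, t])
        print(f"target {t}: mae={mae:.3f} (support={mask.sum()})")
```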
@@ -910,9 +908,13 @@ if __name__ == "__main__":
 
     # Save ensemble metadata (n_ensemble = number of models for inference)
     n_ensemble = len(ensemble_models)
-    ensemble_metadata = {
+    ensemble_metadata = {
+        "n_ensemble": n_ensemble,
+        "n_folds": n_folds,
+        "target_columns": target_columns,
+    }
     joblib.dump(ensemble_metadata, os.path.join(args.model_dir, "ensemble_metadata.joblib"))
-    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds})")
+    print(f"Saved ensemble metadata (n_ensemble={n_ensemble}, n_folds={n_folds}, targets={target_columns})")
 
     # Save label encoder if classification
     if label_encoder is not None: