workbench 0.8.205__py3-none-any.whl → 0.8.212__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/api/endpoint.py +3 -6
- workbench/api/feature_set.py +1 -1
- workbench/api/model.py +5 -11
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/endpoint_core.py +57 -145
- workbench/core/artifacts/model_core.py +21 -19
- workbench/core/transforms/features_to_model/features_to_model.py +2 -2
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
- workbench/model_script_utils/model_script_utils.py +335 -0
- workbench/model_script_utils/pytorch_utils.py +395 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +289 -666
- workbench/model_scripts/chemprop/generated_model_script.py +292 -669
- workbench/model_scripts/chemprop/model_script_utils.py +335 -0
- workbench/model_scripts/chemprop/requirements.txt +2 -10
- workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
- workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
- workbench/model_scripts/pytorch_model/pytorch.template +350 -607
- workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/script_generation.py +2 -5
- workbench/model_scripts/uq_models/generated_model_script.py +65 -422
- workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
- workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +344 -407
- workbench/scripts/training_test.py +85 -0
- workbench/utils/chemprop_utils.py +18 -656
- workbench/utils/metrics_utils.py +172 -0
- workbench/utils/model_utils.py +104 -47
- workbench/utils/pytorch_utils.py +32 -472
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +49 -356
- workbench/web_interface/components/plugins/model_details.py +30 -68
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/METADATA +5 -5
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/RECORD +42 -31
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/entry_points.txt +1 -0
- workbench/model_scripts/uq_models/mapie.template +0 -605
- workbench/model_scripts/uq_models/requirements.txt +0 -1
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/WHEEL +0 -0
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.205.dist-info → workbench-0.8.212.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,395 @@
+"""PyTorch utilities for tabular data modeling.
+
+Provides a lightweight TabularMLP model with categorical embeddings and
+training utilities for use in Workbench model scripts.
+"""
+
+import json
+import os
+from typing import Optional
+
+import joblib
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+
+
+class FeatureScaler:
+    """Standard scaler for continuous features (zero mean, unit variance)."""
+
+    def __init__(self):
+        self.means: Optional[np.ndarray] = None
+        self.stds: Optional[np.ndarray] = None
+        self.feature_names: Optional[list[str]] = None
+
+    def fit(self, df: pd.DataFrame, continuous_cols: list[str]) -> "FeatureScaler":
+        """Fit the scaler on training data."""
+        self.feature_names = continuous_cols
+        data = df[continuous_cols].values.astype(np.float32)
+        self.means = np.nanmean(data, axis=0)
+        self.stds = np.nanstd(data, axis=0)
+        # Avoid division by zero for constant features
+        self.stds[self.stds == 0] = 1.0
+        return self
+
+    def transform(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform data using fitted parameters."""
+        data = df[self.feature_names].values.astype(np.float32)
+        # Fill NaN with mean before scaling
+        for i, mean in enumerate(self.means):
+            data[np.isnan(data[:, i]), i] = mean
+        return (data - self.means) / self.stds
+
+    def fit_transform(self, df: pd.DataFrame, continuous_cols: list[str]) -> np.ndarray:
+        """Fit and transform in one step."""
+        self.fit(df, continuous_cols)
+        return self.transform(df)
+
+    def save(self, path: str) -> None:
+        """Save scaler parameters."""
+        joblib.dump(
+            {
+                "means": self.means.tolist(),
+                "stds": self.stds.tolist(),
+                "feature_names": self.feature_names,
+            },
+            path,
+        )
+
+    @classmethod
+    def load(cls, path: str) -> "FeatureScaler":
+        """Load scaler from saved parameters."""
+        data = joblib.load(path)
+        scaler = cls()
+        scaler.means = np.array(data["means"], dtype=np.float32)
+        scaler.stds = np.array(data["stds"], dtype=np.float32)
+        scaler.feature_names = data["feature_names"]
+        return scaler
+
+
+class TabularMLP(nn.Module):
+    """Feedforward neural network for tabular data with optional categorical embeddings.
+
+    Args:
+        n_continuous: Number of continuous input features
+        categorical_cardinalities: List of cardinalities for each categorical feature
+        embedding_dims: List of embedding dimensions for each categorical feature
+        hidden_layers: List of hidden layer sizes (e.g., [256, 128, 64])
+        n_outputs: Number of output units
+        task: "regression" or "classification"
+        dropout: Dropout rate
+        use_batch_norm: Whether to use batch normalization
+    """
+
+    def __init__(
+        self,
+        n_continuous: int,
+        categorical_cardinalities: list[int],
+        embedding_dims: list[int],
+        hidden_layers: list[int],
+        n_outputs: int,
+        task: str = "regression",
+        dropout: float = 0.1,
+        use_batch_norm: bool = True,
+    ):
+        super().__init__()
+        self.task = task
+        self.n_continuous = n_continuous
+        self.categorical_cardinalities = categorical_cardinalities
+
+        # Embedding layers for categorical features
+        self.embeddings = nn.ModuleList(
+            [nn.Embedding(n_cats, emb_dim) for n_cats, emb_dim in zip(categorical_cardinalities, embedding_dims)]
+        )
+
+        # Calculate input dimension
+        total_emb_dim = sum(embedding_dims)
+        input_dim = n_continuous + total_emb_dim
+
+        # Build MLP layers
+        layers = []
+        for hidden_dim in hidden_layers:
+            layers.append(nn.Linear(input_dim, hidden_dim))
+            if use_batch_norm:
+                layers.append(nn.BatchNorm1d(hidden_dim))
+            layers.append(nn.LeakyReLU())
+            layers.append(nn.Dropout(dropout))
+            input_dim = hidden_dim
+
+        self.mlp = nn.Sequential(*layers)
+        self.head = nn.Linear(input_dim, n_outputs)
+
+    def forward(self, x_cont: torch.Tensor, x_cat: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Forward pass.
+
+        Args:
+            x_cont: Continuous features tensor of shape (batch, n_continuous)
+            x_cat: Categorical features tensor of shape (batch, n_categoricals), optional
+
+        Returns:
+            Output tensor of shape (batch, n_outputs)
+        """
+        # Embed categorical features and concatenate with continuous
+        if x_cat is not None and len(self.embeddings) > 0:
+            embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
+            x = torch.cat([x_cont] + embs, dim=1)
+        else:
+            x = x_cont
+
+        x = self.mlp(x)
+        out = self.head(x)
+
+        if self.task == "classification":
+            out = torch.softmax(out, dim=1)
+
+        return out
+
+
+def compute_embedding_dims(cardinalities: list[int], max_dim: int = 50) -> list[int]:
+    """Compute embedding dimensions using the rule of thumb: min(50, (n+1)//2)."""
+    return [min(max_dim, (n + 1) // 2) for n in cardinalities]
+
+
+def prepare_data(
+    df: pd.DataFrame,
+    continuous_cols: list[str],
+    categorical_cols: list[str],
+    target_col: Optional[str] = None,
+    category_mappings: Optional[dict] = None,
+    scaler: Optional[FeatureScaler] = None,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], dict, Optional[FeatureScaler]]:
+    """Prepare dataframe for model input.
+
+    Args:
+        df: Input dataframe
+        continuous_cols: List of continuous feature column names
+        categorical_cols: List of categorical feature column names
+        target_col: Target column name (optional, for training)
+        category_mappings: Existing category mappings (for inference)
+        scaler: Existing FeatureScaler (for inference), or None to fit a new one
+
+    Returns:
+        Tuple of (x_cont, x_cat, y, category_mappings, scaler)
+    """
+    # Continuous features with standardization
+    if scaler is None:
+        scaler = FeatureScaler()
+        cont_data = scaler.fit_transform(df, continuous_cols)
+    else:
+        cont_data = scaler.transform(df)
+    x_cont = torch.tensor(cont_data, dtype=torch.float32)
+
+    # Categorical features
+    x_cat = None
+    if categorical_cols:
+        if category_mappings is None:
+            category_mappings = {}
+            for col in categorical_cols:
+                unique_vals = df[col].unique().tolist()
+                category_mappings[col] = {v: i for i, v in enumerate(unique_vals)}
+
+        cat_indices = []
+        for col in categorical_cols:
+            mapping = category_mappings[col]
+            # Map values to indices, use 0 for unknown categories
+            indices = df[col].map(lambda x: mapping.get(x, 0)).values
+            cat_indices.append(indices)
+
+        x_cat = torch.tensor(np.column_stack(cat_indices), dtype=torch.long)
+
+    # Target
+    y = None
+    if target_col is not None:
+        y = torch.tensor(df[target_col].values, dtype=torch.float32)
+        if len(y.shape) == 1:
+            y = y.unsqueeze(1)
+
+    return x_cont, x_cat, y, category_mappings, scaler
+
+
+def create_model(
+    n_continuous: int,
+    categorical_cardinalities: list[int],
+    hidden_layers: list[int],
+    n_outputs: int,
+    task: str = "regression",
+    dropout: float = 0.1,
+    use_batch_norm: bool = True,
+) -> TabularMLP:
+    """Create a TabularMLP model with appropriate embedding dimensions."""
+    embedding_dims = compute_embedding_dims(categorical_cardinalities)
+    return TabularMLP(
+        n_continuous=n_continuous,
+        categorical_cardinalities=categorical_cardinalities,
+        embedding_dims=embedding_dims,
+        hidden_layers=hidden_layers,
+        n_outputs=n_outputs,
+        task=task,
+        dropout=dropout,
+        use_batch_norm=use_batch_norm,
+    )
+
+
+def train_model(
+    model: TabularMLP,
+    train_x_cont: torch.Tensor,
+    train_x_cat: Optional[torch.Tensor],
+    train_y: torch.Tensor,
+    val_x_cont: torch.Tensor,
+    val_x_cat: Optional[torch.Tensor],
+    val_y: torch.Tensor,
+    task: str = "regression",
+    max_epochs: int = 200,
+    patience: int = 20,
+    batch_size: int = 128,
+    learning_rate: float = 1e-3,
+    device: str = "cpu",
+) -> tuple[TabularMLP, dict]:
+    """Train the model with early stopping.
+
+    Returns:
+        Tuple of (trained model, training history dict)
+    """
+    model = model.to(device)
+
+    # Create dataloaders
+    if train_x_cat is not None:
+        train_dataset = TensorDataset(train_x_cont, train_x_cat, train_y)
+        val_dataset = TensorDataset(val_x_cont, val_x_cat, val_y)
+    else:
+        # Use dummy categorical tensor
+        dummy_cat = torch.zeros(train_x_cont.shape[0], 0, dtype=torch.long)
+        dummy_val_cat = torch.zeros(val_x_cont.shape[0], 0, dtype=torch.long)
+        train_dataset = TensorDataset(train_x_cont, dummy_cat, train_y)
+        val_dataset = TensorDataset(val_x_cont, dummy_val_cat, val_y)
+
+    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
+
+    # Loss and optimizer
+    if task == "classification":
+        criterion = nn.CrossEntropyLoss()
+    else:
+        criterion = nn.MSELoss()
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Training loop with early stopping
+    best_val_loss = float("inf")
+    best_state = None
+    epochs_without_improvement = 0
+    history = {"train_loss": [], "val_loss": []}
+
+    for epoch in range(max_epochs):
+        # Training
+        model.train()
+        train_losses = []
+        for batch in train_loader:
+            x_cont, x_cat, y = [b.to(device) for b in batch]
+            x_cat = x_cat if x_cat.shape[1] > 0 else None
+
+            optimizer.zero_grad()
+            out = model(x_cont, x_cat)
+
+            if task == "classification":
+                loss = criterion(out, y.squeeze().long())
+            else:
+                loss = criterion(out, y)
+
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optimizer.step()
+            train_losses.append(loss.item())
+
+        # Validation
+        model.eval()
+        val_losses = []
+        with torch.no_grad():
+            for batch in val_loader:
+                x_cont, x_cat, y = [b.to(device) for b in batch]
+                x_cat = x_cat if x_cat.shape[1] > 0 else None
+                out = model(x_cont, x_cat)
+
+                if task == "classification":
+                    loss = criterion(out, y.squeeze().long())
+                else:
+                    loss = criterion(out, y)
+                val_losses.append(loss.item())
+
+        train_loss = np.mean(train_losses)
+        val_loss = np.mean(val_losses)
+        history["train_loss"].append(train_loss)
+        history["val_loss"].append(val_loss)
+
+        # Early stopping check
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
+            epochs_without_improvement = 0
+        else:
+            epochs_without_improvement += 1
+
+        if (epoch + 1) % 10 == 0:
+            print(f"Epoch {epoch + 1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")
+
+        if epochs_without_improvement >= patience:
+            print(f"Early stopping at epoch {epoch + 1}")
+            break
+
+    # Load best weights
+    if best_state is not None:
+        model.load_state_dict(best_state)
+
+    model = model.to("cpu")
+    return model, history
+
+
+def predict(
+    model: TabularMLP,
+    x_cont: torch.Tensor,
+    x_cat: Optional[torch.Tensor] = None,
+    device: str = "cpu",
+) -> np.ndarray:
+    """Run inference with the model."""
+    model = model.to(device)
+    model.eval()
+
+    with torch.no_grad():
+        x_cont = x_cont.to(device)
+        if x_cat is not None:
+            x_cat = x_cat.to(device)
+        out = model(x_cont, x_cat)
+
+    return out.cpu().numpy()
+
+
+def save_model(model: TabularMLP, path: str, model_config: dict) -> None:
+    """Save model weights and configuration."""
+    os.makedirs(path, exist_ok=True)
+    torch.save(model.state_dict(), os.path.join(path, "model.pt"))
+    with open(os.path.join(path, "config.json"), "w") as f:
+        json.dump(model_config, f, indent=2)
+
+
+def load_model(path: str, device: str = "cpu") -> TabularMLP:
+    """Load model from saved weights and configuration."""
+    with open(os.path.join(path, "config.json")) as f:
+        config = json.load(f)
+
+    model = create_model(
+        n_continuous=config["n_continuous"],
+        categorical_cardinalities=config["categorical_cardinalities"],
+        hidden_layers=config["hidden_layers"],
+        n_outputs=config["n_outputs"],
+        task=config["task"],
+        dropout=config.get("dropout", 0.1),
+        use_batch_norm=config.get("use_batch_norm", True),
+    )
+
+    state_dict = torch.load(os.path.join(path, "model.pt"), map_location=device, weights_only=True)
+    model.load_state_dict(state_dict)
+    model.eval()
+
+    return model
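The utilities above compose into a short train/predict pipeline: prepare_data fits the scaler and category mappings, create_model sizes embeddings via min(50, (n+1)//2) (e.g. compute_embedding_dims([12, 5]) -> [6, 3]), train_model runs early stopping, and predict returns a NumPy array. A minimal usage sketch, not part of the diff: the pytorch_utils import path, the synthetic DataFrame, and all column names and hyperparameters below are illustrative assumptions.

    import numpy as np
    import pandas as pd
    import pytorch_utils as pu  # assumed import path for the module above

    # Tiny synthetic dataset: two continuous features, one categorical, one target
    rng = np.random.default_rng(0)
    n = 200
    df = pd.DataFrame({
        "x1": rng.normal(size=n),
        "x2": rng.normal(size=n),
        "group": rng.choice(["a", "b", "c"], size=n),
    })
    df["target"] = 2 * df["x1"] - df["x2"] + (df["group"] == "a") + 0.1 * rng.normal(size=n)
    train_df, val_df = df.iloc[:150], df.iloc[150:]
    cont_cols, cat_cols = ["x1", "x2"], ["group"]

    # Fit the scaler and category mappings on train, then reuse them on validation
    x_cont, x_cat, y, cat_maps, scaler = pu.prepare_data(train_df, cont_cols, cat_cols, target_col="target")
    vx_cont, vx_cat, vy, _, _ = pu.prepare_data(
        val_df, cont_cols, cat_cols, target_col="target", category_mappings=cat_maps, scaler=scaler
    )

    # Cardinalities come from the fitted mappings; embedding dims are derived internally
    cardinalities = [len(cat_maps[c]) for c in cat_cols]
    model = pu.create_model(
        n_continuous=len(cont_cols),
        categorical_cardinalities=cardinalities,
        hidden_layers=[64, 32],
        n_outputs=1,
        task="regression",
    )

    model, history = pu.train_model(model, x_cont, x_cat, y, vx_cont, vx_cat, vy, task="regression", patience=10)
    preds = pu.predict(model, vx_cont, vx_cat)  # NumPy array of shape (50, 1)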
@@ -1,2 +1,2 @@
-# Note: The training and inference images already have torch
+# Note: The training and inference images already have torch + supporting packages installed.
 # So we only need to install packages that are not already included in the images.
@@ -0,0 +1,278 @@
+"""UQ Harness: Uncertainty Quantification using MAPIE Conformalized Quantile Regression.
+
+This module provides a reusable UQ harness that can wrap any point predictor model
+(XGBoost, PyTorch, ChemProp, etc.) to provide calibrated prediction intervals.
+
+Usage:
+    # Training
+    uq_models, uq_metadata = train_uq_models(X_train, y_train, X_val, y_val)
+    save_uq_models(uq_models, uq_metadata, model_dir)
+
+    # Inference
+    uq_models, uq_metadata = load_uq_models(model_dir)
+    df = predict_intervals(df, X, uq_models, uq_metadata)
+    df = compute_confidence(df, uq_metadata["median_interval_width"])
+"""
+
+import json
+import os
+import numpy as np
+import pandas as pd
+import joblib
+from lightgbm import LGBMRegressor
+from mapie.regression import ConformalizedQuantileRegressor
+
+
+# Default confidence levels for prediction intervals
+DEFAULT_CONFIDENCE_LEVELS = [0.50, 0.68, 0.80, 0.90, 0.95]
+
+
+def train_uq_models(
+    X_train: pd.DataFrame | np.ndarray,
+    y_train: pd.Series | np.ndarray,
+    X_val: pd.DataFrame | np.ndarray,
+    y_val: pd.Series | np.ndarray,
+    confidence_levels: list[float] | None = None,
+) -> tuple[dict, dict]:
+    """Train MAPIE UQ models for multiple confidence levels.
+
+    Args:
+        X_train: Training features
+        y_train: Training targets
+        X_val: Validation features for conformalization
+        y_val: Validation targets for conformalization
+        confidence_levels: List of confidence levels (default: [0.50, 0.68, 0.80, 0.90, 0.95])
+
+    Returns:
+        Tuple of (uq_models dict, uq_metadata dict)
+    """
+    if confidence_levels is None:
+        confidence_levels = DEFAULT_CONFIDENCE_LEVELS
+
+    mapie_models = {}
+
+    for confidence_level in confidence_levels:
+        alpha = 1 - confidence_level
+        lower_q = alpha / 2
+        upper_q = 1 - alpha / 2
+
+        print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
+        print(f"  Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")
+
+        # Train three LightGBM quantile models for this confidence level
+        quantile_estimators = []
+        for q in [lower_q, upper_q, 0.5]:
+            print(f"  Training model for quantile {q:.3f}...")
+            est = LGBMRegressor(
+                objective="quantile",
+                alpha=q,
+                n_estimators=1000,
+                max_depth=6,
+                learning_rate=0.01,
+                num_leaves=31,
+                min_child_samples=20,
+                subsample=0.8,
+                colsample_bytree=0.8,
+                random_state=42,
+                verbose=-1,
+                force_col_wise=True,
+            )
+            est.fit(X_train, y_train)
+            quantile_estimators.append(est)
+
+        # Create MAPIE CQR model for this confidence level
+        print(f"  Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
+        mapie_model = ConformalizedQuantileRegressor(
+            quantile_estimators, confidence_level=confidence_level, prefit=True
+        )
+
+        # Conformalize the model with validation data
+        print("  Conformalizing with validation data...")
+        mapie_model.conformalize(X_val, y_val)
+
+        # Store the model
+        model_name = f"mapie_{confidence_level:.2f}"
+        mapie_models[model_name] = mapie_model
+
+        # Validate coverage for this confidence level
+        y_pred, y_pis = mapie_model.predict_interval(X_val)
+        coverage = np.mean((y_val >= y_pis[:, 0, 0]) & (y_val <= y_pis[:, 1, 0]))
+        print(f"  Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")
+
+    # Compute median interval width for confidence calculation (using 80% CI = q_10 to q_90)
+    print("\nComputing normalization statistics for confidence scores...")
+    model_80 = mapie_models["mapie_0.80"]
+    _, y_pis_80 = model_80.predict_interval(X_val)
+    interval_width = np.abs(y_pis_80[:, 1, 0] - y_pis_80[:, 0, 0])
+    median_interval_width = float(np.median(interval_width))
+    print(f"  Median interval width (q_10-q_90): {median_interval_width:.6f}")
+
+    # Analyze interval widths across confidence levels
+    print("\nInterval Width Analysis:")
+    for conf_level in confidence_levels:
+        model = mapie_models[f"mapie_{conf_level:.2f}"]
+        _, y_pis = model.predict_interval(X_val)
+        widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
+        print(f"  {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")
+
+    uq_metadata = {
+        "confidence_levels": confidence_levels,
+        "median_interval_width": median_interval_width,
+    }
+
+    return mapie_models, uq_metadata
+
+
+def save_uq_models(uq_models: dict, uq_metadata: dict, model_dir: str) -> None:
+    """Save UQ models and metadata to disk.
+
+    Args:
+        uq_models: Dictionary of MAPIE models keyed by name (e.g., "mapie_0.80")
+        uq_metadata: Dictionary with confidence_levels and median_interval_width
+        model_dir: Directory to save models
+    """
+    # Save each MAPIE model
+    for model_name, model in uq_models.items():
+        joblib.dump(model, os.path.join(model_dir, f"{model_name}.joblib"))
+
+    # Save median interval width
+    with open(os.path.join(model_dir, "median_interval_width.json"), "w") as fp:
+        json.dump(uq_metadata["median_interval_width"], fp)
+
+    # Save UQ metadata
+    with open(os.path.join(model_dir, "uq_metadata.json"), "w") as fp:
+        json.dump(uq_metadata, fp, indent=2)
+
+    print(f"Saved {len(uq_models)} UQ models to {model_dir}")
+
+
+def load_uq_models(model_dir: str) -> tuple[dict, dict]:
+    """Load UQ models and metadata from disk.
+
+    Args:
+        model_dir: Directory containing saved models
+
+    Returns:
+        Tuple of (uq_models dict, uq_metadata dict)
+    """
+    # Load UQ metadata
+    uq_metadata_path = os.path.join(model_dir, "uq_metadata.json")
+    if os.path.exists(uq_metadata_path):
+        with open(uq_metadata_path) as fp:
+            uq_metadata = json.load(fp)
+    else:
+        # Fallback for older models that only have median_interval_width.json
+        uq_metadata = {"confidence_levels": DEFAULT_CONFIDENCE_LEVELS}
+        median_width_path = os.path.join(model_dir, "median_interval_width.json")
+        if os.path.exists(median_width_path):
+            with open(median_width_path) as fp:
+                uq_metadata["median_interval_width"] = json.load(fp)
+
+    # Load all MAPIE models
+    uq_models = {}
+    for conf_level in uq_metadata["confidence_levels"]:
+        model_name = f"mapie_{conf_level:.2f}"
+        model_path = os.path.join(model_dir, f"{model_name}.joblib")
+        if os.path.exists(model_path):
+            uq_models[model_name] = joblib.load(model_path)
+
+    return uq_models, uq_metadata
+
+
+def predict_intervals(
+    df: pd.DataFrame,
+    X: pd.DataFrame | np.ndarray,
+    uq_models: dict,
+    uq_metadata: dict,
+) -> pd.DataFrame:
+    """Add prediction intervals to a DataFrame.
+
+    Args:
+        df: DataFrame to add interval columns to
+        X: Features for prediction (must match training features)
+        uq_models: Dictionary of MAPIE models
+        uq_metadata: Dictionary with confidence_levels
+
+    Returns:
+        DataFrame with added quantile columns (q_025, q_05, ..., q_975)
+    """
+    confidence_levels = uq_metadata["confidence_levels"]
+
+    for conf_level in confidence_levels:
+        model_name = f"mapie_{conf_level:.2f}"
+        model = uq_models[model_name]
+
+        # Get conformalized predictions
+        y_pred, y_pis = model.predict_interval(X)
+
+        # Map confidence levels to quantile column names
+        if conf_level == 0.50:  # 50% CI
+            df["q_25"] = y_pis[:, 0, 0]
+            df["q_75"] = y_pis[:, 1, 0]
+            df["q_50"] = y_pred  # Median prediction
+        elif conf_level == 0.68:  # 68% CI (~1 std)
+            df["q_16"] = y_pis[:, 0, 0]
+            df["q_84"] = y_pis[:, 1, 0]
+        elif conf_level == 0.80:  # 80% CI
+            df["q_10"] = y_pis[:, 0, 0]
+            df["q_90"] = y_pis[:, 1, 0]
+        elif conf_level == 0.90:  # 90% CI
+            df["q_05"] = y_pis[:, 0, 0]
+            df["q_95"] = y_pis[:, 1, 0]
+        elif conf_level == 0.95:  # 95% CI
+            df["q_025"] = y_pis[:, 0, 0]
+            df["q_975"] = y_pis[:, 1, 0]
+
+    # Calculate pseudo-standard deviation from the 68% interval width
+    if "q_84" in df.columns and "q_16" in df.columns:
+        df["prediction_std"] = (df["q_84"] - df["q_16"]).abs() / 2.0
+
+    # Reorder quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_50", "q_75", "q_84", "q_90", "q_95", "q_975"]
+    existing_q_cols = [c for c in quantile_cols if c in df.columns]
+    other_cols = [c for c in df.columns if c not in quantile_cols]
+    df = df[other_cols + existing_q_cols]
+
+    return df
+
+
+def compute_confidence(
+    df: pd.DataFrame,
+    median_interval_width: float,
+    lower_q: str = "q_10",
+    upper_q: str = "q_90",
+    alpha: float = 1.0,
+    beta: float = 1.0,
+) -> pd.DataFrame:
+    """Compute confidence scores (0.0 to 1.0) based on prediction interval width.
+
+    Uses exponential decay based on:
+    1. Interval width relative to median (alpha weight)
+    2. Distance from median prediction (beta weight)
+
+    Args:
+        df: DataFrame with 'prediction', 'q_50', and quantile columns
+        median_interval_width: Pre-computed median interval width from training data
+        lower_q: Lower quantile column name (default: 'q_10')
+        upper_q: Upper quantile column name (default: 'q_90')
+        alpha: Weight for interval width term (default: 1.0)
+        beta: Weight for distance from median term (default: 1.0)
+
+    Returns:
+        DataFrame with added 'confidence' column
+    """
+    # Interval width
+    interval_width = (df[upper_q] - df[lower_q]).abs()
+
+    # Distance from median, normalized by interval width
+    distance_from_median = (df["prediction"] - df["q_50"]).abs()
+    normalized_distance = distance_from_median / (interval_width + 1e-6)
+
+    # Cap the distance penalty at 1.0
+    normalized_distance = np.minimum(normalized_distance, 1.0)
+
+    # Confidence using exponential decay
+    interval_term = interval_width / median_interval_width
+    df["confidence"] = np.exp(-(alpha * interval_term + beta * normalized_distance))

+    return df
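The harness trains three LightGBM quantile models per confidence level, conformalizes them with MAPIE on the validation split, and exposes the results as q_* columns; compute_confidence then maps each row to exp(-(alpha * width / median_width + beta * min(|prediction - q_50| / width, 1))). An end-to-end sketch, not part of the diff: the uq_harness import path, the stand-in point predictor, the output directory, and the synthetic data are illustrative assumptions.

    import os
    import numpy as np
    import pandas as pd
    from lightgbm import LGBMRegressor
    import uq_harness as uq  # assumed import path for the module above

    rng = np.random.default_rng(0)
    X = pd.DataFrame({"x1": rng.normal(size=400), "x2": rng.normal(size=400)})
    y = 3 * X["x1"] - X["x2"] + 0.5 * rng.normal(size=400)
    X_train, y_train = X.iloc[:250], y.iloc[:250]
    X_val, y_val = X.iloc[250:325], y.iloc[250:325]
    X_test = X.iloc[325:]

    # Any point predictor works; the harness only supplies the intervals
    point_model = LGBMRegressor(n_estimators=200, verbose=-1).fit(X_train, y_train)

    # Train quantile models and conformalize on the held-out validation split
    uq_models, uq_metadata = uq.train_uq_models(X_train, y_train, X_val, y_val)
    model_dir = "/tmp/uq_demo"
    os.makedirs(model_dir, exist_ok=True)
    uq.save_uq_models(uq_models, uq_metadata, model_dir)

    # Inference: attach quantile columns and a confidence score to the point predictions
    uq_models, uq_metadata = uq.load_uq_models(model_dir)
    df = pd.DataFrame({"prediction": point_model.predict(X_test)})
    df = uq.predict_intervals(df, X_test, uq_models, uq_metadata)
    df = uq.compute_confidence(df, uq_metadata["median_interval_width"])
    print(df[["prediction", "q_10", "q_50", "q_90", "confidence"]].head())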
@@ -110,18 +110,15 @@ def generate_model_script(template_params: dict) -> str:
     if template_params.get("model_class"):
         template_name = "scikit_learn.template"
         model_script_dir = "scikit_learn"
-    elif template_params["model_framework"] == ModelFramework.
+    elif template_params["model_framework"] == ModelFramework.PYTORCH:
         template_name = "pytorch.template"
         model_script_dir = "pytorch_model"
     elif template_params["model_framework"] == ModelFramework.CHEMPROP:
         template_name = "chemprop.template"
         model_script_dir = "chemprop"
-    elif template_params["model_type"] in [ModelType.REGRESSOR, ModelType.CLASSIFIER]:
+    elif template_params["model_type"] in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.CLASSIFIER]:
         template_name = "xgb_model.template"
         model_script_dir = "xgb_model"
-    elif template_params["model_type"] == ModelType.UQ_REGRESSOR:
-        template_name = "mapie.template"
-        model_script_dir = "uq_models"
     elif template_params["model_type"] == ModelType.ENSEMBLE_REGRESSOR:
         template_name = "ensemble_xgb.template"
         model_script_dir = "ensemble_xgb"