workbench 0.8.202-py3-none-any.whl → 0.8.220-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench might be problematic.

Files changed (84)
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  3. workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
  4. workbench/algorithms/dataframe/projection_2d.py +44 -21
  5. workbench/algorithms/dataframe/proximity.py +78 -150
  6. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  7. workbench/algorithms/models/cleanlab_model.py +382 -0
  8. workbench/algorithms/models/noise_model.py +388 -0
  9. workbench/algorithms/sql/outliers.py +3 -3
  10. workbench/api/__init__.py +3 -0
  11. workbench/api/df_store.py +17 -108
  12. workbench/api/endpoint.py +13 -11
  13. workbench/api/feature_set.py +111 -8
  14. workbench/api/meta_model.py +289 -0
  15. workbench/api/model.py +45 -12
  16. workbench/api/parameter_store.py +3 -52
  17. workbench/cached/cached_model.py +4 -4
  18. workbench/core/artifacts/artifact.py +5 -5
  19. workbench/core/artifacts/df_store_core.py +114 -0
  20. workbench/core/artifacts/endpoint_core.py +228 -237
  21. workbench/core/artifacts/feature_set_core.py +185 -230
  22. workbench/core/artifacts/model_core.py +34 -26
  23. workbench/core/artifacts/parameter_store_core.py +98 -0
  24. workbench/core/pipelines/pipeline_executor.py +1 -1
  25. workbench/core/transforms/features_to_model/features_to_model.py +22 -10
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  28. workbench/model_script_utils/model_script_utils.py +339 -0
  29. workbench/model_script_utils/pytorch_utils.py +405 -0
  30. workbench/model_script_utils/uq_harness.py +278 -0
  31. workbench/model_scripts/chemprop/chemprop.template +428 -631
  32. workbench/model_scripts/chemprop/generated_model_script.py +432 -635
  33. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  34. workbench/model_scripts/chemprop/requirements.txt +2 -10
  35. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  36. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  37. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  38. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  39. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  40. workbench/model_scripts/meta_model/meta_model.template +209 -0
  41. workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
  42. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  43. workbench/model_scripts/pytorch_model/pytorch.template +370 -609
  44. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  45. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  46. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  47. workbench/model_scripts/script_generation.py +6 -5
  48. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  49. workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
  50. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  51. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  52. workbench/model_scripts/xgb_model/xgb_model.template +366 -396
  53. workbench/repl/workbench_shell.py +0 -5
  54. workbench/resources/open_source_api.key +1 -1
  55. workbench/scripts/endpoint_test.py +2 -2
  56. workbench/scripts/meta_model_sim.py +35 -0
  57. workbench/scripts/training_test.py +85 -0
  58. workbench/utils/chem_utils/fingerprints.py +87 -46
  59. workbench/utils/chem_utils/projections.py +16 -6
  60. workbench/utils/chemprop_utils.py +36 -655
  61. workbench/utils/meta_model_simulator.py +499 -0
  62. workbench/utils/metrics_utils.py +256 -0
  63. workbench/utils/model_utils.py +192 -54
  64. workbench/utils/pytorch_utils.py +33 -472
  65. workbench/utils/shap_utils.py +1 -55
  66. workbench/utils/xgboost_local_crossfold.py +267 -0
  67. workbench/utils/xgboost_model_utils.py +49 -356
  68. workbench/web_interface/components/model_plot.py +7 -1
  69. workbench/web_interface/components/plugins/model_details.py +30 -68
  70. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  71. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
  72. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
  73. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
  74. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  75. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  76. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  77. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  78. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  79. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  80. workbench/model_scripts/uq_models/mapie.template +0 -605
  81. workbench/model_scripts/uq_models/requirements.txt +0 -1
  82. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  83. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
  84. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,405 @@
"""PyTorch utilities for tabular data modeling.

Provides a lightweight TabularMLP model with categorical embeddings and
training utilities for use in Workbench model scripts.
"""

import json
import os
from typing import Optional

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


class FeatureScaler:
    """Standard scaler for continuous features (zero mean, unit variance)."""

    def __init__(self):
        self.means: Optional[np.ndarray] = None
        self.stds: Optional[np.ndarray] = None
        self.feature_names: Optional[list[str]] = None

    def fit(self, df: pd.DataFrame, continuous_cols: list[str]) -> "FeatureScaler":
        """Fit the scaler on training data."""
        self.feature_names = continuous_cols
        data = df[continuous_cols].values.astype(np.float32)
        self.means = np.nanmean(data, axis=0)
        self.stds = np.nanstd(data, axis=0)
        # Avoid division by zero for constant features
        self.stds[self.stds == 0] = 1.0
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Transform data using fitted parameters."""
        data = df[self.feature_names].values.astype(np.float32)
        # Fill NaN with mean before scaling
        for i, mean in enumerate(self.means):
            data[np.isnan(data[:, i]), i] = mean
        return (data - self.means) / self.stds

    def fit_transform(self, df: pd.DataFrame, continuous_cols: list[str]) -> np.ndarray:
        """Fit and transform in one step."""
        self.fit(df, continuous_cols)
        return self.transform(df)

    def save(self, path: str) -> None:
        """Save scaler parameters."""
        joblib.dump(
            {
                "means": self.means.tolist(),
                "stds": self.stds.tolist(),
                "feature_names": self.feature_names,
            },
            path,
        )

    @classmethod
    def load(cls, path: str) -> "FeatureScaler":
        """Load scaler from saved parameters."""
        data = joblib.load(path)
        scaler = cls()
        scaler.means = np.array(data["means"], dtype=np.float32)
        scaler.stds = np.array(data["stds"], dtype=np.float32)
        scaler.feature_names = data["feature_names"]
        return scaler


class TabularMLP(nn.Module):
    """Feedforward neural network for tabular data with optional categorical embeddings.

    Args:
        n_continuous: Number of continuous input features
        categorical_cardinalities: List of cardinalities for each categorical feature
        embedding_dims: List of embedding dimensions for each categorical feature
        hidden_layers: List of hidden layer sizes (e.g., [256, 128, 64])
        n_outputs: Number of output units
        task: "regression" or "classification"
        dropout: Dropout rate
        use_batch_norm: Whether to use batch normalization
    """

    def __init__(
        self,
        n_continuous: int,
        categorical_cardinalities: list[int],
        embedding_dims: list[int],
        hidden_layers: list[int],
        n_outputs: int,
        task: str = "regression",
        dropout: float = 0.1,
        use_batch_norm: bool = True,
    ):
        super().__init__()
        self.task = task
        self.n_continuous = n_continuous
        self.categorical_cardinalities = categorical_cardinalities

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleList(
            [nn.Embedding(n_cats, emb_dim) for n_cats, emb_dim in zip(categorical_cardinalities, embedding_dims)]
        )

        # Calculate input dimension
        total_emb_dim = sum(embedding_dims)
        input_dim = n_continuous + total_emb_dim

        # Build MLP layers
        layers = []
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(input_dim, hidden_dim))
            if use_batch_norm:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.LeakyReLU())
            layers.append(nn.Dropout(dropout))
            input_dim = hidden_dim

        self.mlp = nn.Sequential(*layers)
        self.head = nn.Linear(input_dim, n_outputs)

    def forward(self, x_cont: torch.Tensor, x_cat: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Forward pass.

        Args:
            x_cont: Continuous features tensor of shape (batch, n_continuous)
            x_cat: Categorical features tensor of shape (batch, n_categoricals), optional

        Returns:
            Output tensor of shape (batch, n_outputs)
        """
        # Embed categorical features and concatenate with continuous
        if x_cat is not None and len(self.embeddings) > 0:
            embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
            x = torch.cat([x_cont] + embs, dim=1)
        else:
            x = x_cont

        x = self.mlp(x)
        out = self.head(x)

        if self.task == "classification":
            out = torch.softmax(out, dim=1)

        return out


def compute_embedding_dims(cardinalities: list[int], max_dim: int = 50) -> list[int]:
    """Compute embedding dimensions using the rule of thumb: min(50, (n+1)//2)."""
    return [min(max_dim, (n + 1) // 2) for n in cardinalities]


def prepare_data(
    df: pd.DataFrame,
    continuous_cols: list[str],
    categorical_cols: list[str],
    target_col: Optional[str] = None,
    category_mappings: Optional[dict] = None,
    scaler: Optional[FeatureScaler] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], dict, Optional[FeatureScaler]]:
    """Prepare dataframe for model input.

    Args:
        df: Input dataframe
        continuous_cols: List of continuous feature column names
        categorical_cols: List of categorical feature column names
        target_col: Target column name (optional, for training)
        category_mappings: Existing category mappings (for inference)
        scaler: Existing FeatureScaler (for inference), or None to fit a new one

    Returns:
        Tuple of (x_cont, x_cat, y, category_mappings, scaler)
    """
    # Continuous features with standardization
    if scaler is None:
        scaler = FeatureScaler()
        cont_data = scaler.fit_transform(df, continuous_cols)
    else:
        cont_data = scaler.transform(df)
    x_cont = torch.tensor(cont_data, dtype=torch.float32)

    # Categorical features
    x_cat = None
    if categorical_cols:
        if category_mappings is None:
            category_mappings = {}
            for col in categorical_cols:
                unique_vals = df[col].unique().tolist()
                category_mappings[col] = {v: i for i, v in enumerate(unique_vals)}

        cat_indices = []
        for col in categorical_cols:
            mapping = category_mappings[col]
            # Map values to indices, use 0 for unknown categories
            indices = df[col].map(lambda x: mapping.get(x, 0)).values
            cat_indices.append(indices)

        x_cat = torch.tensor(np.column_stack(cat_indices), dtype=torch.long)

    # Target
    y = None
    if target_col is not None:
        y = torch.tensor(df[target_col].values, dtype=torch.float32)
        if len(y.shape) == 1:
            y = y.unsqueeze(1)

    return x_cont, x_cat, y, category_mappings, scaler


def create_model(
    n_continuous: int,
    categorical_cardinalities: list[int],
    hidden_layers: list[int],
    n_outputs: int,
    task: str = "regression",
    dropout: float = 0.1,
    use_batch_norm: bool = True,
) -> TabularMLP:
    """Create a TabularMLP model with appropriate embedding dimensions."""
    embedding_dims = compute_embedding_dims(categorical_cardinalities)
    return TabularMLP(
        n_continuous=n_continuous,
        categorical_cardinalities=categorical_cardinalities,
        embedding_dims=embedding_dims,
        hidden_layers=hidden_layers,
        n_outputs=n_outputs,
        task=task,
        dropout=dropout,
        use_batch_norm=use_batch_norm,
    )


def train_model(
    model: TabularMLP,
    train_x_cont: torch.Tensor,
    train_x_cat: Optional[torch.Tensor],
    train_y: torch.Tensor,
    val_x_cont: torch.Tensor,
    val_x_cat: Optional[torch.Tensor],
    val_y: torch.Tensor,
    task: str = "regression",
    max_epochs: int = 200,
    patience: int = 20,
    batch_size: int = 128,
    learning_rate: float = 1e-3,
    loss: str = "L1Loss",
    device: str = "cpu",
) -> tuple[TabularMLP, dict]:
    """Train the model with early stopping.

    Returns:
        Tuple of (trained model, training history dict)
    """
    model = model.to(device)

    # Create dataloaders
    if train_x_cat is not None:
        train_dataset = TensorDataset(train_x_cont, train_x_cat, train_y)
        val_dataset = TensorDataset(val_x_cont, val_x_cat, val_y)
    else:
        # Use dummy categorical tensor
        dummy_cat = torch.zeros(train_x_cont.shape[0], 0, dtype=torch.long)
        dummy_val_cat = torch.zeros(val_x_cont.shape[0], 0, dtype=torch.long)
        train_dataset = TensorDataset(train_x_cont, dummy_cat, train_y)
        val_dataset = TensorDataset(val_x_cont, dummy_val_cat, val_y)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Loss and optimizer
    if task == "classification":
        criterion = nn.CrossEntropyLoss()
    else:
        # Map loss name to PyTorch loss class
        loss_map = {
            "L1Loss": nn.L1Loss,
            "MSELoss": nn.MSELoss,
            "HuberLoss": nn.HuberLoss,
            "SmoothL1Loss": nn.SmoothL1Loss,
        }
        if loss not in loss_map:
            raise ValueError(f"Unknown loss '{loss}'. Supported: {list(loss_map.keys())}")
        criterion = loss_map[loss]()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop with early stopping
    best_val_loss = float("inf")
    best_state = None
    epochs_without_improvement = 0
    history = {"train_loss": [], "val_loss": []}

    for epoch in range(max_epochs):
        # Training
        model.train()
        train_losses = []
        for batch in train_loader:
            x_cont, x_cat, y = [b.to(device) for b in batch]
            x_cat = x_cat if x_cat.shape[1] > 0 else None

            optimizer.zero_grad()
            out = model(x_cont, x_cat)

            if task == "classification":
                loss = criterion(out, y.squeeze().long())
            else:
                loss = criterion(out, y)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(loss.item())

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                x_cont, x_cat, y = [b.to(device) for b in batch]
                x_cat = x_cat if x_cat.shape[1] > 0 else None
                out = model(x_cont, x_cat)

                if task == "classification":
                    loss = criterion(out, y.squeeze().long())
                else:
                    loss = criterion(out, y)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

        if epochs_without_improvement >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Load best weights
    if best_state is not None:
        model.load_state_dict(best_state)

    model = model.to("cpu")
    return model, history


def predict(
    model: TabularMLP,
    x_cont: torch.Tensor,
    x_cat: Optional[torch.Tensor] = None,
    device: str = "cpu",
) -> np.ndarray:
    """Run inference with the model."""
    model = model.to(device)
    model.eval()

    with torch.no_grad():
        x_cont = x_cont.to(device)
        if x_cat is not None:
            x_cat = x_cat.to(device)
        out = model(x_cont, x_cat)

    return out.cpu().numpy()


def save_model(model: TabularMLP, path: str, model_config: dict) -> None:
    """Save model weights and configuration."""
    os.makedirs(path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(path, "model.pt"))
    with open(os.path.join(path, "config.json"), "w") as f:
        json.dump(model_config, f, indent=2)


def load_model(path: str, device: str = "cpu") -> TabularMLP:
    """Load model from saved weights and configuration."""
    with open(os.path.join(path, "config.json")) as f:
        config = json.load(f)

    model = create_model(
        n_continuous=config["n_continuous"],
        categorical_cardinalities=config["categorical_cardinalities"],
        hidden_layers=config["hidden_layers"],
        n_outputs=config["n_outputs"],
        task=config["task"],
        dropout=config.get("dropout", 0.1),
        use_batch_norm=config.get("use_batch_norm", True),
    )

    state_dict = torch.load(os.path.join(path, "model.pt"), map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    return model
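A minimal end-to-end sketch of how these utilities fit together (the import path, column names, and hyperparameters below are illustrative assumptions, not part of the package; the module ships alongside the generated model scripts):

    import pandas as pd
    import pytorch_utils as ptu  # assumed import path; the file sits next to the generated model script

    # Hypothetical training frame: two continuous features, one categorical, one target
    df = pd.DataFrame(
        {
            "logp": [1.2, 0.4, 2.8, 1.9],
            "mol_wt": [180.0, 210.5, 330.1, 150.2],
            "assay": ["a", "b", "a", "b"],
            "solubility": [0.5, 0.7, 0.1, 0.9],
        }
    )

    # Scale continuous features, index-encode the categorical, and build tensors
    x_cont, x_cat, y, mappings, scaler = ptu.prepare_data(
        df, continuous_cols=["logp", "mol_wt"], categorical_cols=["assay"], target_col="solubility"
    )

    # Small regression MLP; embedding dims are derived from the categorical cardinalities
    model = ptu.create_model(n_continuous=2, categorical_cardinalities=[2], hidden_layers=[64, 32], n_outputs=1)

    # Train with early stopping (this toy example reuses the training split as validation)
    model, history = ptu.train_model(model, x_cont, x_cat, y, x_cont, x_cat, y, max_epochs=20, patience=5)
    preds = ptu.predict(model, x_cont, x_cat)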
@@ -0,0 +1,278 @@
"""UQ Harness: Uncertainty Quantification using MAPIE Conformalized Quantile Regression.

This module provides a reusable UQ harness that can wrap any point predictor model
(XGBoost, PyTorch, ChemProp, etc.) to provide calibrated prediction intervals.

Usage:
    # Training
    uq_models, uq_metadata = train_uq_models(X_train, y_train, X_val, y_val)
    save_uq_models(uq_models, uq_metadata, model_dir)

    # Inference
    uq_models, uq_metadata = load_uq_models(model_dir)
    df = predict_intervals(df, X, uq_models, uq_metadata)
    df = compute_confidence(df, uq_metadata["median_interval_width"])
"""

import json
import os
import numpy as np
import pandas as pd
import joblib
from lightgbm import LGBMRegressor
from mapie.regression import ConformalizedQuantileRegressor


# Default confidence levels for prediction intervals
DEFAULT_CONFIDENCE_LEVELS = [0.50, 0.68, 0.80, 0.90, 0.95]


def train_uq_models(
    X_train: pd.DataFrame | np.ndarray,
    y_train: pd.Series | np.ndarray,
    X_val: pd.DataFrame | np.ndarray,
    y_val: pd.Series | np.ndarray,
    confidence_levels: list[float] | None = None,
) -> tuple[dict, dict]:
    """Train MAPIE UQ models for multiple confidence levels.

    Args:
        X_train: Training features
        y_train: Training targets
        X_val: Validation features for conformalization
        y_val: Validation targets for conformalization
        confidence_levels: List of confidence levels (default: [0.50, 0.68, 0.80, 0.90, 0.95])

    Returns:
        Tuple of (uq_models dict, uq_metadata dict)
    """
    if confidence_levels is None:
        confidence_levels = DEFAULT_CONFIDENCE_LEVELS

    mapie_models = {}

    for confidence_level in confidence_levels:
        alpha = 1 - confidence_level
        lower_q = alpha / 2
        upper_q = 1 - alpha / 2

        print(f"\nTraining quantile models for {confidence_level * 100:.0f}% confidence interval...")
        print(f" Quantiles: {lower_q:.3f}, {upper_q:.3f}, 0.500")

        # Train three LightGBM quantile models for this confidence level
        quantile_estimators = []
        for q in [lower_q, upper_q, 0.5]:
            print(f" Training model for quantile {q:.3f}...")
            est = LGBMRegressor(
                objective="quantile",
                alpha=q,
                n_estimators=1000,
                max_depth=6,
                learning_rate=0.01,
                num_leaves=31,
                min_child_samples=20,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbose=-1,
                force_col_wise=True,
            )
            est.fit(X_train, y_train)
            quantile_estimators.append(est)

        # Create MAPIE CQR model for this confidence level
        print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
        mapie_model = ConformalizedQuantileRegressor(
            quantile_estimators, confidence_level=confidence_level, prefit=True
        )

        # Conformalize the model with validation data
        print(" Conformalizing with validation data...")
        mapie_model.conformalize(X_val, y_val)

        # Store the model
        model_name = f"mapie_{confidence_level:.2f}"
        mapie_models[model_name] = mapie_model

        # Validate coverage for this confidence level
        y_pred, y_pis = mapie_model.predict_interval(X_val)
        coverage = np.mean((y_val >= y_pis[:, 0, 0]) & (y_val <= y_pis[:, 1, 0]))
        print(f" Coverage: Target={confidence_level * 100:.0f}%, Empirical={coverage * 100:.1f}%")

    # Compute median interval width for confidence calculation (using 80% CI = q_10 to q_90)
    print("\nComputing normalization statistics for confidence scores...")
    model_80 = mapie_models["mapie_0.80"]
    _, y_pis_80 = model_80.predict_interval(X_val)
    interval_width = np.abs(y_pis_80[:, 1, 0] - y_pis_80[:, 0, 0])
    median_interval_width = float(np.median(interval_width))
    print(f" Median interval width (q_10-q_90): {median_interval_width:.6f}")

    # Analyze interval widths across confidence levels
    print("\nInterval Width Analysis:")
    for conf_level in confidence_levels:
        model = mapie_models[f"mapie_{conf_level:.2f}"]
        _, y_pis = model.predict_interval(X_val)
        widths = y_pis[:, 1, 0] - y_pis[:, 0, 0]
        print(f" {conf_level * 100:.0f}% CI: Mean width={np.mean(widths):.3f}, Std={np.std(widths):.3f}")

    uq_metadata = {
        "confidence_levels": confidence_levels,
        "median_interval_width": median_interval_width,
    }

    return mapie_models, uq_metadata


def save_uq_models(uq_models: dict, uq_metadata: dict, model_dir: str) -> None:
    """Save UQ models and metadata to disk.

    Args:
        uq_models: Dictionary of MAPIE models keyed by name (e.g., "mapie_0.80")
        uq_metadata: Dictionary with confidence_levels and median_interval_width
        model_dir: Directory to save models
    """
    # Save each MAPIE model
    for model_name, model in uq_models.items():
        joblib.dump(model, os.path.join(model_dir, f"{model_name}.joblib"))

    # Save median interval width
    with open(os.path.join(model_dir, "median_interval_width.json"), "w") as fp:
        json.dump(uq_metadata["median_interval_width"], fp)

    # Save UQ metadata
    with open(os.path.join(model_dir, "uq_metadata.json"), "w") as fp:
        json.dump(uq_metadata, fp, indent=2)

    print(f"Saved {len(uq_models)} UQ models to {model_dir}")


def load_uq_models(model_dir: str) -> tuple[dict, dict]:
    """Load UQ models and metadata from disk.

    Args:
        model_dir: Directory containing saved models

    Returns:
        Tuple of (uq_models dict, uq_metadata dict)
    """
    # Load UQ metadata
    uq_metadata_path = os.path.join(model_dir, "uq_metadata.json")
    if os.path.exists(uq_metadata_path):
        with open(uq_metadata_path) as fp:
            uq_metadata = json.load(fp)
    else:
        # Fallback for older models that only have median_interval_width.json
        uq_metadata = {"confidence_levels": DEFAULT_CONFIDENCE_LEVELS}
        median_width_path = os.path.join(model_dir, "median_interval_width.json")
        if os.path.exists(median_width_path):
            with open(median_width_path) as fp:
                uq_metadata["median_interval_width"] = json.load(fp)

    # Load all MAPIE models
    uq_models = {}
    for conf_level in uq_metadata["confidence_levels"]:
        model_name = f"mapie_{conf_level:.2f}"
        model_path = os.path.join(model_dir, f"{model_name}.joblib")
        if os.path.exists(model_path):
            uq_models[model_name] = joblib.load(model_path)

    return uq_models, uq_metadata


def predict_intervals(
    df: pd.DataFrame,
    X: pd.DataFrame | np.ndarray,
    uq_models: dict,
    uq_metadata: dict,
) -> pd.DataFrame:
    """Add prediction intervals to a DataFrame.

    Args:
        df: DataFrame to add interval columns to
        X: Features for prediction (must match training features)
        uq_models: Dictionary of MAPIE models
        uq_metadata: Dictionary with confidence_levels

    Returns:
        DataFrame with added quantile columns (q_025, q_05, ..., q_975)
    """
    confidence_levels = uq_metadata["confidence_levels"]

    for conf_level in confidence_levels:
        model_name = f"mapie_{conf_level:.2f}"
        model = uq_models[model_name]

        # Get conformalized predictions
        y_pred, y_pis = model.predict_interval(X)

        # Map confidence levels to quantile column names
        if conf_level == 0.50:  # 50% CI
            df["q_25"] = y_pis[:, 0, 0]
            df["q_75"] = y_pis[:, 1, 0]
            df["q_50"] = y_pred  # Median prediction
        elif conf_level == 0.68:  # 68% CI (~1 std)
            df["q_16"] = y_pis[:, 0, 0]
            df["q_84"] = y_pis[:, 1, 0]
        elif conf_level == 0.80:  # 80% CI
            df["q_10"] = y_pis[:, 0, 0]
            df["q_90"] = y_pis[:, 1, 0]
        elif conf_level == 0.90:  # 90% CI
            df["q_05"] = y_pis[:, 0, 0]
            df["q_95"] = y_pis[:, 1, 0]
        elif conf_level == 0.95:  # 95% CI
            df["q_025"] = y_pis[:, 0, 0]
            df["q_975"] = y_pis[:, 1, 0]

    # Calculate pseudo-standard deviation from the 68% interval width
    if "q_84" in df.columns and "q_16" in df.columns:
        df["prediction_std"] = (df["q_84"] - df["q_16"]).abs() / 2.0

    # Reorder quantile columns for easier reading
    quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_50", "q_75", "q_84", "q_90", "q_95", "q_975"]
    existing_q_cols = [c for c in quantile_cols if c in df.columns]
    other_cols = [c for c in df.columns if c not in quantile_cols]
    df = df[other_cols + existing_q_cols]

    return df


def compute_confidence(
    df: pd.DataFrame,
    median_interval_width: float,
    lower_q: str = "q_10",
    upper_q: str = "q_90",
    alpha: float = 1.0,
    beta: float = 1.0,
) -> pd.DataFrame:
    """Compute confidence scores (0.0 to 1.0) based on prediction interval width.

    Uses exponential decay based on:
        1. Interval width relative to median (alpha weight)
        2. Distance from median prediction (beta weight)

    Args:
        df: DataFrame with 'prediction', 'q_50', and quantile columns
        median_interval_width: Pre-computed median interval width from training data
        lower_q: Lower quantile column name (default: 'q_10')
        upper_q: Upper quantile column name (default: 'q_90')
        alpha: Weight for interval width term (default: 1.0)
        beta: Weight for distance from median term (default: 1.0)

    Returns:
        DataFrame with added 'confidence' column
    """
    # Interval width
    interval_width = (df[upper_q] - df[lower_q]).abs()

    # Distance from median, normalized by interval width
    distance_from_median = (df["prediction"] - df["q_50"]).abs()
    normalized_distance = distance_from_median / (interval_width + 1e-6)

    # Cap the distance penalty at 1.0
    normalized_distance = np.minimum(normalized_distance, 1.0)

    # Confidence using exponential decay
    interval_term = interval_width / median_interval_width
    df["confidence"] = np.exp(-(alpha * interval_term + beta * normalized_distance))

    return df
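For reference, the confidence score above is a single exponential-decay expression; here is a small worked example with made-up numbers, assuming the default weights alpha = beta = 1.0:

    import numpy as np

    # confidence = exp(-(alpha * interval_width / median_interval_width + beta * normalized_distance))
    # Illustrative row: an 80% interval (q_10 to q_90) of width 0.6 against a training-set
    # median width of 0.5, with the point prediction 0.15 away from the conformal median q_50.
    interval_width = 0.6
    median_interval_width = 0.5
    normalized_distance = min(0.15 / (interval_width + 1e-6), 1.0)  # capped at 1.0, as in compute_confidence

    confidence = np.exp(-(1.0 * interval_width / median_interval_width + 1.0 * normalized_distance))
    print(round(confidence, 3))  # ~0.235: a wide, off-median prediction gets a low confidence score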