workbench 0.8.205__py3-none-any.whl → 0.8.213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. workbench/algorithms/models/noise_model.py +388 -0
  2. workbench/api/endpoint.py +3 -6
  3. workbench/api/feature_set.py +1 -1
  4. workbench/api/model.py +5 -11
  5. workbench/cached/cached_model.py +4 -4
  6. workbench/core/artifacts/endpoint_core.py +63 -153
  7. workbench/core/artifacts/model_core.py +21 -19
  8. workbench/core/transforms/features_to_model/features_to_model.py +2 -2
  9. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +1 -1
  10. workbench/model_script_utils/model_script_utils.py +335 -0
  11. workbench/model_script_utils/pytorch_utils.py +395 -0
  12. workbench/model_script_utils/uq_harness.py +278 -0
  13. workbench/model_scripts/chemprop/chemprop.template +289 -666
  14. workbench/model_scripts/chemprop/generated_model_script.py +292 -669
  15. workbench/model_scripts/chemprop/model_script_utils.py +335 -0
  16. workbench/model_scripts/chemprop/requirements.txt +2 -10
  17. workbench/model_scripts/pytorch_model/generated_model_script.py +355 -612
  18. workbench/model_scripts/pytorch_model/model_script_utils.py +335 -0
  19. workbench/model_scripts/pytorch_model/pytorch.template +350 -607
  20. workbench/model_scripts/pytorch_model/pytorch_utils.py +395 -0
  21. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  22. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  23. workbench/model_scripts/script_generation.py +2 -5
  24. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  25. workbench/model_scripts/xgb_model/generated_model_script.py +349 -412
  26. workbench/model_scripts/xgb_model/model_script_utils.py +335 -0
  27. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  28. workbench/model_scripts/xgb_model/xgb_model.template +344 -407
  29. workbench/scripts/training_test.py +85 -0
  30. workbench/utils/chemprop_utils.py +18 -656
  31. workbench/utils/metrics_utils.py +172 -0
  32. workbench/utils/model_utils.py +104 -47
  33. workbench/utils/pytorch_utils.py +32 -472
  34. workbench/utils/xgboost_local_crossfold.py +267 -0
  35. workbench/utils/xgboost_model_utils.py +49 -356
  36. workbench/web_interface/components/plugins/model_details.py +30 -68
  37. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/METADATA +5 -5
  38. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/RECORD +42 -31
  39. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/entry_points.txt +1 -0
  40. workbench/model_scripts/uq_models/mapie.template +0 -605
  41. workbench/model_scripts/uq_models/requirements.txt +0 -1
  42. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/WHEEL +0 -0
  43. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/licenses/LICENSE +0 -0
  44. {workbench-0.8.205.dist-info → workbench-0.8.213.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,388 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from xgboost import XGBRegressor
4
+ from typing import List
5
+ import logging
6
+
7
+ from workbench.algorithms.dataframe.proximity import Proximity
8
+
9
+ # Set up logging
10
+ log = logging.getLogger("workbench")
11
+
12
+
13
class NoiseModel:
    """Composite noise detection for regression data using multiple complementary signals.

    The NoiseModel identifies potentially noisy or problematic samples in regression datasets
    by combining three independent signals:

    1. **Underfit Model Residuals**: A deliberately simple XGBoost model (low depth, few trees)
       that captures only the main trends. High residuals indicate samples in complex regions
       or unusual areas of the feature space.

    2. **Overfit Model Residuals**: A deliberately complex XGBoost model (deep trees, many
       iterations, no regularization) that attempts to memorize the training data. High residuals
       here indicate samples the model *cannot* fit even when trying to memorize - a strong
       signal of label noise. This is the "training error" approach validated in:
       "Denoising Drug Discovery Data for Improved ADMET Property Prediction" (Merck, JCIM 2024)

    3. **High Target Gradient (HTG)**: Using the Proximity class, measures disagreement between
       a sample's target value and its neighbors in feature space. High gradients indicate
       activity cliffs or potential measurement errors where similar compounds have very
       different target values.

    The combined noise score weights the overfit residual signal more heavily (2x) based on
    the paper's finding that training error is the most reliable noise detector for regression.

    Example:
        ```python
        from workbench.algorithms.models.noise_model import NoiseModel

        # Create noise model
        noise_model = NoiseModel(df, id_column="id", features=feature_list, target="target")

        # Get noise scores for all samples
        scores_df = noise_model.get_scores()

        # Get sample weights for training (lower weight for noisy samples)
        weights = noise_model.get_sample_weights(strategy="inverse")

        # Get clean subset (bottom 90% by noise score)
        clean_df = noise_model.get_clean_subset(percentile=90)

        # Find samples with same features but different targets (definite noise)
        conflicts = noise_model.coincident_conflicts()
        ```

    References:
        Adrian, M., Chung, Y., & Cheng, A. C. (2024). Denoising Drug Discovery Data for
        Improved ADMET Property Prediction. J. Chem. Inf. Model., 64(16), 6324-6337.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        features: List[str],
        target: str,
    ):
        """
        Initialize the NoiseModel class.

        Args:
            df: DataFrame containing data for noise detection.
            id_column: Name of the column used as the identifier.
            features: List of feature column names.
            target: Name of the target column.

        Raises:
            ValueError: If the target column has zero variance (all signals are
                normalized by the target std, so a constant target is undefined).
        """
        self.id_column = id_column
        self.target = target

        # Filter out non-numeric features
        self.features = self._validate_features(df, features)

        # Drop NaN rows in features and target
        self.df = df.dropna(subset=self.features + [self.target]).copy()

        # Compute target stats for normalization
        self.target_std = self.df[self.target].std()
        self.target_range = self.df[self.target].max() - self.df[self.target].min()

        # Guard: every noise signal divides by target_std, so a constant (or single-row)
        # target would silently produce inf/NaN scores downstream — fail loudly instead
        if not self.target_std or np.isnan(self.target_std):
            raise ValueError(f"Target column '{self.target}' has zero variance; noise detection is undefined")

        # Build all component models
        self._build_models()

        # Precompute all noise signals
        self._precompute_signals()

    def get_scores(self) -> pd.DataFrame:
        """
        Get noise scores for all samples.

        Returns:
            DataFrame with id, individual signal columns, and combined noise_score,
            sorted by noise_score descending (noisiest samples first)
        """
        result = self.df[[self.id_column, self.target]].copy()
        result["underfit_residual"] = self.df["underfit_residual"]
        result["overfit_residual"] = self.df["overfit_residual"]
        result["htg_score"] = self.df["htg_score"]
        result["noise_score"] = self.df["noise_score"]
        return result.sort_values("noise_score", ascending=False).reset_index(drop=True)

    def get_sample_weights(self, strategy: str = "inverse") -> pd.Series:
        """
        Get sample weights for training, indexed by id_column.

        Args:
            strategy: Weighting strategy
                - "inverse": 1 / (1 + noise_score)
                - "soft": 1 - noise_score (clipped to [0.1, 1.0])
                - "threshold": 1.0 if noise_score < median, else 0.5

        Returns:
            Series of weights indexed by id_column

        Raises:
            ValueError: If an unknown strategy name is given.
        """
        scores = self.df.set_index(self.id_column)["noise_score"]

        if strategy == "inverse":
            weights = 1.0 / (1.0 + scores)
        elif strategy == "soft":
            weights = (1.0 - scores).clip(lower=0.1, upper=1.0)
        elif strategy == "threshold":
            median_score = scores.median()
            weights = (scores < median_score).apply(lambda x: 1.0 if x else 0.5)
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        return weights

    def get_clean_subset(self, percentile: float = 90.0) -> pd.DataFrame:
        """
        Get a subset of data with lowest noise scores.

        Args:
            percentile: Keep samples below this percentile of noise score (default: 90 = bottom 90%)

        Returns:
            DataFrame of "clean" samples
        """
        threshold = np.percentile(self.df["noise_score"], percentile)
        return self.df[self.df["noise_score"] <= threshold].copy()

    def get_noisy_samples(self, top_percent: float = 10.0) -> pd.DataFrame:
        """
        Get samples with highest noise scores.

        Args:
            top_percent: Percentage of noisiest samples to return (default: 10%)

        Returns:
            DataFrame of noisy samples, sorted by noise_score descending
        """
        percentile = 100 - top_percent
        threshold = np.percentile(self.df["noise_score"], percentile)
        noisy = self.df[self.df["noise_score"] >= threshold].copy()
        return noisy.sort_values("noise_score", ascending=False).reset_index(drop=True)

    def coincident_conflicts(self, distance_threshold: float = 1e-5) -> pd.DataFrame:
        """
        Find samples that map to the same point in feature space but have different targets.

        These are definitive noise - same features, different target values.

        Args:
            distance_threshold: Maximum distance to consider "coincident" (default: 1e-5)

        Returns:
            DataFrame of coincident conflicts with their target differences
        """
        # Use proximity to find coincident points (nn_distance copied in during _build_models)
        coincident = self.df[self.df["nn_distance"] < distance_threshold].copy()

        # Keep the empty-result schema identical to the populated one so callers
        # can rely on the same columns either way (including noise_score)
        conflict_columns = [self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff", "noise_score"]
        if len(coincident) == 0:
            return pd.DataFrame(columns=conflict_columns)

        return (
            coincident[conflict_columns]
            .sort_values("nn_target_diff", ascending=False)
            .reset_index(drop=True)
        )

    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
        """Remove non-numeric features and log warnings."""
        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
        if non_numeric:
            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
        return [f for f in features if f not in non_numeric]

    def _build_models(self) -> None:
        """Build the underfit, overfit, and proximity models."""
        log.info("Building noise detection models...")

        X = self.df[self.features]
        y = self.df[self.target]

        # Underfit model: intentionally simple (high bias)
        log.info("  Fitting underfit model...")
        self.underfit_model = XGBRegressor(
            max_depth=2,
            n_estimators=20,
            learning_rate=0.1,
            random_state=42,
            verbosity=0,
        )
        self.underfit_model.fit(X, y)

        # Overfit model: intentionally complex (high variance, low regularization)
        log.info("  Fitting overfit model...")
        self.overfit_model = XGBRegressor(
            max_depth=12,
            n_estimators=500,
            learning_rate=0.1,
            reg_lambda=0.0,
            reg_alpha=0.0,
            min_child_weight=1,
            random_state=42,
            verbosity=0,
        )
        self.overfit_model.fit(X, y)

        # Proximity model for feature space analysis
        log.info("  Building proximity model...")
        self.proximity = Proximity(
            self.df,
            id_column=self.id_column,
            features=self.features,
            target=self.target,
        )

        # Copy proximity metrics to our df (.values strips the proximity index
        # so alignment is positional, matching row order)
        self.df["nn_distance"] = self.proximity.df["nn_distance"].values
        self.df["nn_id"] = self.proximity.df["nn_id"].values
        self.df["nn_target"] = self.proximity.df["nn_target"].values
        self.df["nn_target_diff"] = self.proximity.df["nn_target_diff"].values

        log.info("Noise detection models built successfully")

    def _precompute_signals(self) -> None:
        """Precompute all noise signals for every sample."""
        log.info("Precomputing noise signals...")

        X = self.df[self.features]
        y = self.df[self.target].values

        # Underfit residuals (normalized by target std)
        underfit_pred = self.underfit_model.predict(X)
        self.df["underfit_residual"] = np.abs(y - underfit_pred) / self.target_std

        # Overfit residuals (normalized by target std)
        # This is the key "training error" signal from the paper
        overfit_pred = self.overfit_model.predict(X)
        self.df["overfit_residual"] = np.abs(y - overfit_pred) / self.target_std

        # HTG score: neighbor disagreement (normalized by target std)
        # Using nn_target_diff directly, normalized
        self.df["htg_score"] = self.df["nn_target_diff"] / self.target_std

        # Combine into overall noise score
        # Scale each component to [0, 1] using percentile ranks, then average
        self.df["noise_score"] = self._compute_combined_score()

        log.info("Noise signals precomputed successfully")

    def _compute_combined_score(self) -> np.ndarray:
        """
        Combine individual signals into a single noise score.

        Uses percentile ranks to normalize each signal to [0, 1], then averages.
        Overfit residual gets higher weight as it's the most validated signal (per the paper).
        """
        # Convert to percentile ranks (0-1 scale)
        overfit_rank = self.df["overfit_residual"].rank(pct=True)
        htg_rank = self.df["htg_score"].rank(pct=True)

        # Weighted average: overfit gets 2x weight based on paper's findings
        # that training error is the best noise detector
        combined = (2.0 * overfit_rank + 1.0 * htg_rank) / 3.0

        return combined.values
288
+
289
+
290
# Testing the NoiseModel class
if __name__ == "__main__":

    from workbench.api import FeatureSet, Model

    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Create a sample DataFrame with some noisy points
    np.random.seed(42)
    n_samples = 100

    # Generate clean data: y = 2*x1 + 3*x2 + noise
    x1 = np.random.randn(n_samples)
    x2 = np.random.randn(n_samples)
    y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1

    # Add some noisy points (last 10 samples)
    y_noisy = y_clean.copy()
    y_noisy[-10:] += np.random.randn(10) * 5  # Large noise

    data = {
        "ID": [f"sample_{i}" for i in range(n_samples)],
        "Feature1": x1,
        "Feature2": x2,
        "target": y_noisy,
    }
    df = pd.DataFrame(data)

    print("=" * 80)
    print("Testing NoiseModel...")
    print("=" * 80)

    # Create noise model
    noise_model = NoiseModel(
        df,
        id_column="ID",
        features=["Feature1", "Feature2"],
        target="target",
    )

    # Get noise scores
    print("\nTop 10 noisiest samples:")
    scores = noise_model.get_scores()
    print(scores.head(10))

    # Check if our artificially noisy samples are detected
    noisy_ids = [f"sample_{i}" for i in range(90, 100)]
    detected = scores[scores["ID"].isin(noisy_ids)]
    median_score = scores["noise_score"].median()
    print(f"\nOf 10 noisy samples, {len(detected[detected['noise_score'] > median_score])} above median noise score")

    # Get sample weights
    print("\nSample weights (inverse strategy):")
    weights = noise_model.get_sample_weights(strategy="inverse")
    print(f"  Min weight: {weights.min():.3f}")
    print(f"  Max weight: {weights.max():.3f}")
    print(f"  Mean weight: {weights.mean():.3f}")

    # Get clean subset
    clean = noise_model.get_clean_subset(percentile=90)
    print(f"\nClean subset (bottom 90%): {len(clean)} samples")

    # Get noisy samples
    noisy = noise_model.get_noisy_samples(top_percent=10)
    print(f"\nNoisy samples (top 10%): {len(noisy)} samples")
    print(noisy[["ID", "target", "overfit_residual", "htg_score", "noise_score"]].head())

    # Test with real data
    print("\n" + "=" * 80)
    print("Testing with AQSol data...")
    print("=" * 80)
    fs = FeatureSet("aqsol_features")
    model = Model("aqsol-regression")

    # Guard both artifacts: model.features()/model.target() are called below,
    # so checking only fs.exists() (as before) could blow up on a missing model
    if fs.exists() and model.exists():
        features = model.features()
        target = model.target()
        df = fs.pull_dataframe()

        noise_model = NoiseModel(
            df,
            id_column=fs.id_column,
            features=features,
            target=target,
        )

        print("\nTop 10 noisiest compounds:")
        scores = noise_model.get_scores()
        print(scores.head(10))

        print("\nCoincident conflicts:")
        conflicts = noise_model.coincident_conflicts()
        print(f"Found {len(conflicts)} coincident conflicts")
        if len(conflicts) > 0:
            print(conflicts.head())

        print("\nNoise score distribution:")
        print(scores["noise_score"].describe())
workbench/api/endpoint.py CHANGED
@@ -70,16 +70,13 @@ class Endpoint(EndpointCore):
70
70
  """
71
71
  return super().fast_inference(eval_df, threads=threads)
72
72
 
73
- def cross_fold_inference(self, nfolds: int = 5) -> pd.DataFrame:
74
- """Run cross-fold inference (only works for XGBoost models)
75
-
76
- Args:
77
- nfolds (int): The number of folds to use for cross-validation (default: 5)
73
+ def cross_fold_inference(self) -> pd.DataFrame:
74
+ """Pull cross-fold inference from model associated with this Endpoint
78
75
 
79
76
  Returns:
80
77
  pd.DataFrame: A DataFrame with cross fold predictions
81
78
  """
82
- return super().cross_fold_inference(nfolds)
79
+ return super().cross_fold_inference()
83
80
 
84
81
 
85
82
  if __name__ == "__main__":
@@ -128,7 +128,7 @@ class FeatureSet(FeatureSetCore):
128
128
  tags = [name] if tags is None else tags
129
129
 
130
130
  # If the model framework is PyTorch or ChemProp, ensure we set the training and inference images
131
- if model_framework in (ModelFramework.PYTORCH_TABULAR, ModelFramework.CHEMPROP):
131
+ if model_framework in (ModelFramework.PYTORCH, ModelFramework.CHEMPROP):
132
132
  training_image = "pytorch_training"
133
133
  inference_image = "pytorch_inference"
134
134
 
workbench/api/model.py CHANGED
@@ -10,7 +10,7 @@ from workbench.core.artifacts.artifact import Artifact
10
10
  from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework # noqa: F401
11
11
  from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint
12
12
  from workbench.api.endpoint import Endpoint
13
- from workbench.utils.model_utils import proximity_model_local, uq_model
13
+ from workbench.utils.model_utils import proximity_model_local, noise_model_local
14
14
 
15
15
 
16
16
  class Model(ModelCore):
@@ -91,19 +91,13 @@ class Model(ModelCore):
91
91
  """
92
92
  return proximity_model_local(self)
93
93
 
94
- def uq_model(self, uq_model_name: str = None, train_all_data: bool = False) -> "Model":
95
- """Create a Uncertainty Quantification Model for this Model
96
-
97
- Args:
98
- uq_model_name (str, optional): Name of the UQ Model (if not specified, a name will be generated)
99
- train_all_data (bool, optional): Whether to train the UQ Model on all data (default: False)
94
+ def noise_model(self):
95
+ """Create a local Noise Model for this Model
100
96
 
101
97
  Returns:
102
- Model: The UQ Model
98
+ NoiseModel: A local Noise Model
103
99
  """
104
- if uq_model_name is None:
105
- uq_model_name = self.model_name + "-uq"
106
- return uq_model(self, uq_model_name, train_all_data=train_all_data)
100
+ return noise_model_local(self)
107
101
 
108
102
 
109
103
  if __name__ == "__main__":
@@ -72,11 +72,11 @@ class CachedModel(CachedArtifactMixin, ModelCore):
72
72
  return super().list_inference_runs()
73
73
 
74
74
  @CachedArtifactMixin.cache_result
75
- def get_inference_metrics(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
75
+ def get_inference_metrics(self, capture_name: str = "auto") -> Union[pd.DataFrame, None]:
76
76
  """Retrieve the captured prediction results for this model
77
77
 
78
78
  Args:
79
- capture_name (str, optional): Specific capture_name (default: latest)
79
+ capture_name (str, optional): Specific capture_name (default: auto)
80
80
 
81
81
  Returns:
82
82
  pd.DataFrame: DataFrame of the Captured Metrics (might be None)
@@ -101,11 +101,11 @@ class CachedModel(CachedArtifactMixin, ModelCore):
101
101
  return df
102
102
 
103
103
  @CachedArtifactMixin.cache_result
104
- def confusion_matrix(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
104
+ def confusion_matrix(self, capture_name: str = "auto") -> Union[pd.DataFrame, None]:
105
105
  """Retrieve the confusion matrix for the model
106
106
 
107
107
  Args:
108
- capture_name (str, optional): Specific capture_name (default: latest)
108
+ capture_name (str, optional): Specific capture_name (default: auto)
109
109
 
110
110
  Returns:
111
111
  pd.DataFrame: DataFrame of the Confusion Matrix (might be None)