workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +12 -11
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +14 -12
- workbench/api/feature_set.py +117 -11
- workbench/api/meta.py +0 -1
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +52 -21
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +49 -11
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +7 -7
- workbench/core/artifacts/data_capture_core.py +8 -1
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +323 -205
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +133 -101
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +60 -44
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +277 -0
- workbench/model_scripts/chemprop/chemprop.template +774 -0
- workbench/model_scripts/chemprop/generated_model_script.py +774 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +15 -12
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +277 -0
- workbench/model_scripts/xgb_model/xgb_model.template +367 -399
- workbench/repl/workbench_shell.py +18 -14
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +122 -6
- workbench/scripts/training_test.py +85 -0
- workbench/themes/dark/custom.css +59 -0
- workbench/themes/dark/plotly.json +5 -5
- workbench/themes/light/custom.css +153 -40
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +59 -0
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/mol_descriptors.py +18 -7
- workbench/utils/chem_utils/mol_standardize.py +80 -58
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +25 -27
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +274 -87
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +159 -34
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/theme_manager.py +95 -30
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -220
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +16 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -3
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +48 -80
- workbench/web_interface/components/plugins/scatter_plot.py +192 -92
- workbench/web_interface/components/settings_menu.py +184 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
- {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
workbench/algorithms/models/noise_model.py
ADDED

@@ -0,0 +1,388 @@
+import pandas as pd
+import numpy as np
+from xgboost import XGBRegressor
+from typing import List
+import logging
+
+from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class NoiseModel:
+    """Composite noise detection for regression data using multiple complementary signals.
+
+    The NoiseModel identifies potentially noisy or problematic samples in regression datasets
+    by combining three independent signals:
+
+    1. **Underfit Model Residuals**: A deliberately simple XGBoost model (low depth, few trees)
+       that captures only the main trends. High residuals indicate samples in complex regions
+       or unusual areas of the feature space.
+
+    2. **Overfit Model Residuals**: A deliberately complex XGBoost model (deep trees, many
+       iterations, no regularization) that attempts to memorize the training data. High residuals
+       here indicate samples the model *cannot* fit even when trying to memorize - a strong
+       signal of label noise. This is the "training error" approach validated in:
+       "Denoising Drug Discovery Data for Improved ADMET Property Prediction" (Merck, JCIM 2024)
+
+    3. **High Target Gradient (HTG)**: Using the Proximity class, measures disagreement between
+       a sample's target value and its neighbors in feature space. High gradients indicate
+       activity cliffs or potential measurement errors where similar compounds have very
+       different target values.
+
+    The combined noise score weights the overfit residual signal more heavily (2x) based on
+    the paper's finding that training error is the most reliable noise detector for regression.
+
+    Example:
+        ```python
+        from workbench.algorithms.models.noise_model import NoiseModel
+
+        # Create noise model
+        noise_model = NoiseModel(df, id_column="id", features=feature_list, target="target")
+
+        # Get noise scores for all samples
+        scores_df = noise_model.get_scores()
+
+        # Get sample weights for training (lower weight for noisy samples)
+        weights = noise_model.get_sample_weights(strategy="inverse")
+
+        # Get clean subset (bottom 90% by noise score)
+        clean_df = noise_model.get_clean_subset(percentile=90)
+
+        # Find samples with same features but different targets (definite noise)
+        conflicts = noise_model.coincident_conflicts()
+        ```
+
+    References:
+        Adrian, M., Chung, Y., & Cheng, A. C. (2024). Denoising Drug Discovery Data for
+        Improved ADMET Property Prediction. J. Chem. Inf. Model., 64(16), 6324-6337.
+    """
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: str,
+    ):
+        """
+        Initialize the NoiseModel class.
+
+        Args:
+            df: DataFrame containing data for noise detection.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names.
+            target: Name of the target column.
+        """
+        self.id_column = id_column
+        self.target = target
+
+        # Filter out non-numeric features
+        self.features = self._validate_features(df, features)
+
+        # Drop NaN rows in features and target
+        self.df = df.dropna(subset=self.features + [self.target]).copy()
+
+        # Compute target stats for normalization
+        self.target_std = self.df[self.target].std()
+        self.target_range = self.df[self.target].max() - self.df[self.target].min()
+
+        # Build all component models
+        self._build_models()
+
+        # Precompute all noise signals
+        self._precompute_signals()
+
+    def get_scores(self) -> pd.DataFrame:
+        """
+        Get noise scores for all samples.
+
+        Returns:
+            DataFrame with id, individual signal columns, and combined noise_score
+        """
+        result = self.df[[self.id_column, self.target]].copy()
+        result["underfit_residual"] = self.df["underfit_residual"]
+        result["overfit_residual"] = self.df["overfit_residual"]
+        result["htg_score"] = self.df["htg_score"]
+        result["noise_score"] = self.df["noise_score"]
+        return result.sort_values("noise_score", ascending=False).reset_index(drop=True)
+
+    def get_sample_weights(self, strategy: str = "inverse") -> pd.Series:
+        """
+        Get sample weights for training, indexed by id_column.
+
+        Args:
+            strategy: Weighting strategy
+                - "inverse": 1 / (1 + noise_score)
+                - "soft": 1 - noise_score (clipped to [0.1, 1.0])
+                - "threshold": 1.0 if noise_score < median, else 0.5
+
+        Returns:
+            Series of weights indexed by id_column
+        """
+        scores = self.df.set_index(self.id_column)["noise_score"]
+
+        if strategy == "inverse":
+            weights = 1.0 / (1.0 + scores)
+        elif strategy == "soft":
+            weights = (1.0 - scores).clip(lower=0.1, upper=1.0)
+        elif strategy == "threshold":
+            median_score = scores.median()
+            weights = (scores < median_score).apply(lambda x: 1.0 if x else 0.5)
+        else:
+            raise ValueError(f"Unknown strategy: {strategy}")
+
+        return weights
+
+    def get_clean_subset(self, percentile: float = 90.0) -> pd.DataFrame:
+        """
+        Get a subset of data with lowest noise scores.
+
+        Args:
+            percentile: Keep samples below this percentile of noise score (default: 90 = bottom 90%)
+
+        Returns:
+            DataFrame of "clean" samples
+        """
+        threshold = np.percentile(self.df["noise_score"], percentile)
+        return self.df[self.df["noise_score"] <= threshold].copy()
+
+    def get_noisy_samples(self, top_percent: float = 10.0) -> pd.DataFrame:
+        """
+        Get samples with highest noise scores.
+
+        Args:
+            top_percent: Percentage of noisiest samples to return (default: 10%)
+
+        Returns:
+            DataFrame of noisy samples, sorted by noise_score descending
+        """
+        percentile = 100 - top_percent
+        threshold = np.percentile(self.df["noise_score"], percentile)
+        noisy = self.df[self.df["noise_score"] >= threshold].copy()
+        return noisy.sort_values("noise_score", ascending=False).reset_index(drop=True)
+
+    def coincident_conflicts(self, distance_threshold: float = 1e-5) -> pd.DataFrame:
+        """
+        Find samples that map to the same point in feature space but have different targets.
+
+        These are definitive noise - same features, different target values.
+
+        Args:
+            distance_threshold: Maximum distance to consider "coincident" (default: 1e-5)
+
+        Returns:
+            DataFrame of coincident conflicts with their target differences
+        """
+        # Use proximity to find coincident points
+        coincident = self.df[self.df["nn_distance"] < distance_threshold].copy()
+
+        if len(coincident) == 0:
+            return pd.DataFrame(columns=[self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff"])
+
+        return (
+            coincident[[self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff", "noise_score"]]
+            .sort_values("nn_target_diff", ascending=False)
+            .reset_index(drop=True)
+        )
+
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+
+    def _build_models(self) -> None:
+        """Build the underfit, overfit, and proximity models."""
+        log.info("Building noise detection models...")
+
+        X = self.df[self.features]
+        y = self.df[self.target]
+
+        # Underfit model: intentionally simple (high bias)
+        log.info(" Fitting underfit model...")
+        self.underfit_model = XGBRegressor(
+            max_depth=2,
+            n_estimators=20,
+            learning_rate=0.1,
+            random_state=42,
+            verbosity=0,
+        )
+        self.underfit_model.fit(X, y)
+
+        # Overfit model: intentionally complex (high variance, low regularization)
+        log.info(" Fitting overfit model...")
+        self.overfit_model = XGBRegressor(
+            max_depth=12,
+            n_estimators=500,
+            learning_rate=0.1,
+            reg_lambda=0.0,
+            reg_alpha=0.0,
+            min_child_weight=1,
+            random_state=42,
+            verbosity=0,
+        )
+        self.overfit_model.fit(X, y)
+
+        # Proximity model for feature space analysis
+        log.info(" Building proximity model...")
+        self.proximity = FeatureSpaceProximity(
+            self.df,
+            id_column=self.id_column,
+            features=self.features,
+            target=self.target,
+        )
+
+        # Copy proximity metrics to our df
+        self.df["nn_distance"] = self.proximity.df["nn_distance"].values
+        self.df["nn_id"] = self.proximity.df["nn_id"].values
+        self.df["nn_target"] = self.proximity.df["nn_target"].values
+        self.df["nn_target_diff"] = self.proximity.df["nn_target_diff"].values
+
+        log.info("Noise detection models built successfully")
+
+    def _precompute_signals(self) -> None:
+        """Precompute all noise signals for every sample."""
+        log.info("Precomputing noise signals...")
+
+        X = self.df[self.features]
+        y = self.df[self.target].values
+
+        # Underfit residuals (normalized by target std)
+        underfit_pred = self.underfit_model.predict(X)
+        self.df["underfit_residual"] = np.abs(y - underfit_pred) / self.target_std
+
+        # Overfit residuals (normalized by target std)
+        # This is the key "training error" signal from the paper
+        overfit_pred = self.overfit_model.predict(X)
+        self.df["overfit_residual"] = np.abs(y - overfit_pred) / self.target_std
+
+        # HTG score: neighbor disagreement (normalized by target std)
+        # Using nn_target_diff directly, normalized
+        self.df["htg_score"] = self.df["nn_target_diff"] / self.target_std
+
+        # Combine into overall noise score
+        # Scale each component to [0, 1] using percentile ranks, then average
+        self.df["noise_score"] = self._compute_combined_score()
+
+        log.info("Noise signals precomputed successfully")
+
+    def _compute_combined_score(self) -> np.ndarray:
+        """
+        Combine individual signals into a single noise score.
+
+        Uses percentile ranks to normalize each signal to [0, 1], then averages.
+        Overfit residual gets higher weight as it's the most validated signal (per the paper).
+        """
+        # Convert to percentile ranks (0-1 scale)
+        overfit_rank = self.df["overfit_residual"].rank(pct=True)
+        htg_rank = self.df["htg_score"].rank(pct=True)
+
+        # Weighted average: overfit gets 2x weight based on paper's findings
+        # that training error is the best noise detector
+        combined = (2.0 * overfit_rank + 1.0 * htg_rank) / 3.0
+
+        return combined.values
+
+
+# Testing the NoiseModel class
+if __name__ == "__main__":
+
+    from workbench.api import FeatureSet, Model
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame with some noisy points
+    np.random.seed(42)
+    n_samples = 100
+
+    # Generate clean data: y = 2*x1 + 3*x2 + noise
+    x1 = np.random.randn(n_samples)
+    x2 = np.random.randn(n_samples)
+    y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1
+
+    # Add some noisy points (last 10 samples)
+    y_noisy = y_clean.copy()
+    y_noisy[-10:] += np.random.randn(10) * 5  # Large noise
+
+    data = {
+        "ID": [f"sample_{i}" for i in range(n_samples)],
+        "Feature1": x1,
+        "Feature2": x2,
+        "target": y_noisy,
+    }
+    df = pd.DataFrame(data)
+
+    print("=" * 80)
+    print("Testing NoiseModel...")
+    print("=" * 80)
+
+    # Create noise model
+    noise_model = NoiseModel(
+        df,
+        id_column="ID",
+        features=["Feature1", "Feature2"],
+        target="target",
+    )
+
+    # Get noise scores
+    print("\nTop 10 noisiest samples:")
+    scores = noise_model.get_scores()
+    print(scores.head(10))
+
+    # Check if our artificially noisy samples are detected
+    noisy_ids = [f"sample_{i}" for i in range(90, 100)]
+    detected = scores[scores["ID"].isin(noisy_ids)]
+    median_score = scores["noise_score"].median()
+    print(f"\nOf 10 noisy samples, {len(detected[detected['noise_score'] > median_score])} above median noise score")
+
+    # Get sample weights
+    print("\nSample weights (inverse strategy):")
+    weights = noise_model.get_sample_weights(strategy="inverse")
+    print(f" Min weight: {weights.min():.3f}")
+    print(f" Max weight: {weights.max():.3f}")
+    print(f" Mean weight: {weights.mean():.3f}")
+
+    # Get clean subset
+    clean = noise_model.get_clean_subset(percentile=90)
+    print(f"\nClean subset (bottom 90%): {len(clean)} samples")
+
+    # Get noisy samples
+    noisy = noise_model.get_noisy_samples(top_percent=10)
+    print(f"\nNoisy samples (top 10%): {len(noisy)} samples")
+    print(noisy[["ID", "target", "overfit_residual", "htg_score", "noise_score"]].head())
+
+    # Test with real data
+    print("\n" + "=" * 80)
+    print("Testing with AQSol data...")
+    print("=" * 80)
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+
+    if fs.exists():
+        features = model.features()
+        target = model.target()
+        df = fs.pull_dataframe()
+
+        noise_model = NoiseModel(
+            df,
+            id_column=fs.id_column,
+            features=features,
+            target=target,
+        )

+        print("\nTop 10 noisiest compounds:")
+        scores = noise_model.get_scores()
+        print(scores.head(10))
+
+        print("\nCoincident conflicts:")
+        conflicts = noise_model.coincident_conflicts()
+        print(f"Found {len(conflicts)} coincident conflicts")
+        if len(conflicts) > 0:
+            print(conflicts.head())
+
+        print("\nNoise score distribution:")
+        print(scores["noise_score"].describe())
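For readers skimming the diff: `_compute_combined_score` above boils down to a weighted average of percentile ranks. A minimal standalone sketch with made-up residual values (the numbers are illustrative, not from the package):

```python
import pandas as pd

# Two of the precomputed signals, with invented values for four samples
signals = pd.DataFrame({
    "overfit_residual": [0.05, 0.10, 1.50, 0.20],  # normalized training error
    "htg_score": [0.30, 0.10, 0.90, 0.40],         # neighbor target disagreement
})

# rank(pct=True) maps each signal onto a comparable [0, 1] scale
overfit_rank = signals["overfit_residual"].rank(pct=True)
htg_rank = signals["htg_score"].rank(pct=True)

# Training error gets 2x weight, mirroring the combination in _compute_combined_score
signals["noise_score"] = (2.0 * overfit_rank + 1.0 * htg_rank) / 3.0
print(signals.sort_values("noise_score", ascending=False))  # row 2 ranks noisiest
```

One observation grounded in the code above: the underfit residual is computed and reported by get_scores(), but as written it does not enter the combined score.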
workbench/algorithms/sql/outliers.py
CHANGED

@@ -209,9 +209,9 @@ class Outliers:
             else:
                 return group.nlargest(n, col)
 
-        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns
-        top_outliers = outlier_df.groupby("outlier_group", group_keys=False).apply(
-            get_extreme_values
+        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns to silence warning
+        top_outliers = outlier_df.groupby("outlier_group", group_keys=False)[outlier_df.columns].apply(
+            get_extreme_values
         )
         return top_outliers.reset_index(drop=True)
 
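Context for the Outliers change: on recent pandas releases, `DataFrameGroupBy.apply` emits a deprecation warning when the applied function also receives the grouping column, because a future version will exclude it. Selecting the columns explicitly opts in to the current behavior and silences the warning. A small sketch with toy data (the column names mirror the diff; the values are invented):

```python
import pandas as pd

df = pd.DataFrame({"outlier_group": ["a", "a", "b", "b"], "value": [1.0, 9.0, 5.0, 2.0]})

def get_extreme_values(group):
    # Keep the single largest row per group (stand-in for the real helper)
    return group.nlargest(1, "value")

# Old form: warns on pandas >= 2.2 because apply() also operates on "outlier_group"
# top = df.groupby("outlier_group", group_keys=False).apply(get_extreme_values)

# New form: explicit column selection keeps "outlier_group" in the result, warning-free
top = df.groupby("outlier_group", group_keys=False)[df.columns].apply(get_extreme_values)
print(top.reset_index(drop=True))
```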
workbench/api/__init__.py
CHANGED

@@ -5,6 +5,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 - DataSource: Manages AWS Data Catalog and Athena
 - FeatureSet: Manages AWS Feature Store and Feature Groups
 - Model: Manages the training and deployment of AWS Model Groups and Packages
+- MetaModel: A Model that aggregates predictions from multiple child endpoints
 - ModelType: Enum for the different model types supported by Workbench
 - Endpoint: Manages the deployment and invocations/inference on AWS Endpoints
 - Meta: Provides an API to retrieve AWS Metadata for the above classes

@@ -14,7 +15,8 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 
 from .data_source import DataSource
 from .feature_set import FeatureSet
-from .model import Model, ModelType
+from .model import Model, ModelType, ModelFramework
+from .meta_model import MetaModel
 from .endpoint import Endpoint
 from .meta import Meta
 from .parameter_store import ParameterStore

@@ -24,7 +26,9 @@ __all__ = [
     "DataSource",
     "FeatureSet",
     "Model",
+    "MetaModel",
     "ModelType",
+    "ModelFramework",
     "Endpoint",
     "Meta",
     "ParameterStore",
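A trivial smoke test of the new root-level exports (assumes a workbench 0.8.227 install):

```python
# MetaModel and ModelFramework are now importable from the package root
from workbench.api import Model, ModelType, ModelFramework, MetaModel

print(Model, ModelType, ModelFramework, MetaModel)
```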
workbench/api/df_store.py
CHANGED

@@ -1,35 +1,32 @@
 """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
 
-from datetime import datetime
 from typing import Union
-import logging
-import pandas as pd
 
 # Workbench Imports
-from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore
+from workbench.core.artifacts.df_store_core import DFStoreCore
 
 
-class DFStore(AWSDFStore):
+class DFStore(DFStoreCore):
     """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
 
-
-
-
+    Common Usage:
+        ```python
+        df_store = DFStore()
 
-
-
+        # List Data
+        df_store.list()
 
-
-
-
+        # Add DataFrame
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        df_store.upsert("/test/my_data", df)
 
-
-
-
+        # Retrieve DataFrame
+        df = df_store.get("/test/my_data")
+        print(df)
 
-
-
-
+        # Delete Data
+        df_store.delete("/test/my_data")
+        ```
     """
 
     def __init__(self, path_prefix: Union[str, None] = None):

@@ -38,101 +35,13 @@ class DFStore(AWSDFStore):
         Args:
             path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
         """
-        self.log = logging.getLogger("workbench")
-
-        # Initialize the SuperClass
         super().__init__(path_prefix=path_prefix)
 
-    def list(self, include_cache: bool = False) -> list:
-        """List all the objects in the data_store prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the list (Defaults to False).
-
-        Returns:
-            list: A list of all the objects in the data_store prefix.
-        """
-        return super().list(include_cache=include_cache)
-
-    def summary(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a nicely formatted summary of object locations, sizes (in MB), and modified dates.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the summary (Defaults to False).
-
-        Returns:
-            pd.DataFrame: A formatted DataFrame with the summary details.
-        """
-        return super().summary(include_cache=include_cache)
-
-    def details(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a DataFrame with detailed metadata for all objects in the data_store prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the details (Defaults to False).
-
-        Returns:
-            pd.DataFrame: A DataFrame with detailed metadata for all objects in the data_store prefix.
-        """
-        return super().details(include_cache=include_cache)
-
-    def check(self, location: str) -> bool:
-        """Check if a DataFrame exists at the specified location
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            bool: True if the data exists, False otherwise.
-        """
-        return super().check(location)
-
-    def get(self, location: str) -> Union[pd.DataFrame, None]:
-        """Retrieve a DataFrame from AWS S3.
-
-        Args:
-            location (str): The location of the data to retrieve.
-
-        Returns:
-            pd.DataFrame: The retrieved DataFrame or None if not found.
-        """
-        _df = super().get(location)
-        if _df is None:
-            self.log.error(f"Dataframe not found at location: {location}")
-        return _df
-
-    def upsert(self, location: str, data: Union[pd.DataFrame, pd.Series]):
-        """Insert or update a DataFrame or Series in the AWS S3.
-
-        Args:
-            location (str): The location of the data.
-            data (Union[pd.DataFrame, pd.Series]): The data to be stored.
-        """
-        super().upsert(location, data)
-
-    def last_modified(self, location: str) -> Union[datetime, None]:
-        """Get the last modified date of the DataFrame at the specified location.
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            Union[datetime, None]: The last modified date of the DataFrame or None if not found.
-        """
-        return super().last_modified(location)
-
-    def delete(self, location: str):
-        """Delete a DataFrame from the AWS S3.
-
-        Args:
-            location (str): The location of the data to delete.
-        """
-        super().delete(location)
-
 
 if __name__ == "__main__":
     """Exercise the DFStore Class"""
     import time
+    import pandas as pd
 
     # Create a DFStore manager
     df_store = DFStore()
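The DFStore rewrite replaces roughly ninety lines of hand-written delegating wrappers with direct inheritance from DFStoreCore, so the public surface is unchanged. A usage sketch mirroring the new docstring (assumes AWS credentials and a configured Workbench account):

```python
import pandas as pd
from workbench.api.df_store import DFStore  # module path as shown in this diff

df_store = DFStore()
df_store.upsert("/test/my_data", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
print(df_store.get("/test/my_data"))  # round-trips the DataFrame from S3/Parquet
print(df_store.list())                # inherited from DFStoreCore, no wrapper needed
df_store.delete("/test/my_data")
```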
workbench/api/endpoint.py
CHANGED

@@ -44,16 +44,21 @@ class Endpoint(EndpointCore):
         """
         return super().inference(eval_df, capture_name, id_column, drop_error_rows)
 
-    def auto_inference(self
-        """Run inference on the Endpoint using the
+    def auto_inference(self) -> pd.DataFrame:
+        """Run inference on the Endpoint using the test data from the model training view
 
-
-
+        Returns:
+            pd.DataFrame: The DataFrame with predictions
+        """
+        return super().auto_inference()
+
+    def full_inference(self) -> pd.DataFrame:
+        """Run inference on the Endpoint using the full data from the model training view
 
         Returns:
             pd.DataFrame: The DataFrame with predictions
         """
-        return super().
+        return super().full_inference()
 
     def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
         """Run inference on the Endpoint using the provided DataFrame

@@ -70,16 +75,13 @@ class Endpoint(EndpointCore):
         """
         return super().fast_inference(eval_df, threads=threads)
 
-    def cross_fold_inference(self
-        """
-
-        Args:
-            nfolds (int): The number of folds to use for cross-validation (default: 5)
+    def cross_fold_inference(self) -> pd.DataFrame:
+        """Pull cross-fold inference from model associated with this Endpoint
 
         Returns:
-
+            pd.DataFrame: A DataFrame with cross fold predictions
         """
-        return super().cross_fold_inference(
+        return super().cross_fold_inference()
 
 
 if __name__ == "__main__":
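With the docstring repairs above, the argument-free inference surface on Endpoint is auto_inference(), full_inference(), and cross_fold_inference(), alongside the existing inference() and fast_inference(). A hedged usage sketch (the endpoint name is illustrative, and a deployed endpoint plus AWS credentials are assumed):

```python
from workbench.api import Endpoint

end = Endpoint("aqsol-regression-end")  # illustrative name, not from this diff

test_preds = end.auto_inference()       # test split of the model's training view
all_preds = end.full_inference()        # every row of the training view
cv_preds = end.cross_fold_inference()   # cross-fold predictions from the model
print(test_preds.head())
```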