workbench-0.8.213-py3-none-any.whl → workbench-0.8.219-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
- workbench/algorithms/dataframe/projection_2d.py +38 -21
- workbench/algorithms/dataframe/proximity.py +75 -150
- workbench/algorithms/graph/light/proximity_graph.py +5 -5
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +2 -2
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +3 -0
- workbench/api/endpoint.py +10 -5
- workbench/api/feature_set.py +76 -6
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +43 -4
- workbench/core/artifacts/endpoint_core.py +65 -117
- workbench/core/artifacts/feature_set_core.py +3 -3
- workbench/core/artifacts/model_core.py +6 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
- workbench/model_script_utils/model_script_utils.py +15 -11
- workbench/model_script_utils/pytorch_utils.py +11 -1
- workbench/model_scripts/chemprop/chemprop.template +147 -71
- workbench/model_scripts/chemprop/generated_model_script.py +151 -75
- workbench/model_scripts/chemprop/model_script_utils.py +15 -11
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +45 -27
- workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
- workbench/model_scripts/pytorch_model/pytorch.template +42 -24
- workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
- workbench/model_scripts/script_generation.py +4 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +167 -156
- workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
- workbench/model_scripts/xgb_model/xgb_model.template +163 -152
- workbench/repl/workbench_shell.py +0 -5
- workbench/scripts/endpoint_test.py +2 -2
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chemprop_utils.py +23 -5
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +94 -10
- workbench/utils/model_utils.py +91 -9
- workbench/utils/pytorch_utils.py +1 -1
- workbench/utils/shap_utils.py +1 -55
- workbench/web_interface/components/plugins/scatter_plot.py +4 -8
- {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/METADATA +2 -1
- {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/RECORD +54 -50
- {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
- {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
- {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
workbench/algorithms/models/cleanlab_model.py ADDED
@@ -0,0 +1,382 @@
+"""Cleanlab-based label quality detection for regression and classification.
+
+Note: Users must install cleanlab separately: pip install cleanlab
+"""
+
+import logging
+from typing import List, Optional
+
+import datasets
+import pandas as pd
+from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
+from sklearn.preprocessing import LabelEncoder
+
+from workbench.core.artifacts.model_core import ModelType
+
+# Check datasets version - Datalab has a bug with datasets>=4.0.0
+# See: https://github.com/cleanlab/cleanlab/issues/1253
+_datasets_major = int(datasets.__version__.split(".")[0])
+if _datasets_major >= 4:
+    raise ImportError(
+        "cleanlab's Datalab requires datasets<4.0.0 due to a known bug.\n"
+        "See: https://github.com/cleanlab/cleanlab/issues/1253\n"
+        "Fix: pip install 'datasets<4.0.0'"
+    )
+
+# Check for cleanlab package
+try:
+    from cleanlab.regression.learn import CleanLearning as CleanLearningRegressor
+    from cleanlab.classification import CleanLearning as CleanLearningClassifier
+    from cleanlab import Datalab
+
+    CLEANLAB_AVAILABLE = True
+except ImportError:
+    CLEANLAB_AVAILABLE = False
+    CleanLearningRegressor = None
+    CleanLearningClassifier = None
+    Datalab = None
+
+# Regressor types for convenience
+REGRESSOR_TYPES = [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class CleanlabModels:
+    """Factory class for cleanlab models with shared data preparation.
+
+    This class handles data preparation once and provides lazy-loaded access
+    to both CleanLearning and Datalab models. Each model is only created
+    when first requested, and the prepared data is shared between them.
+
+    Attributes:
+        id_column: Name of the ID column in the data.
+        features: List of feature column names.
+        target: Name of the target column.
+        model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).
+
+    Example:
+        ```python
+        cleanlab = CleanlabModels(df, "id", features, "target", ModelType.REGRESSOR)
+
+        # Get CleanLearning model for label issues and uncertainty
+        cl = cleanlab.clean_learning()
+        issues = cl.get_label_issues()
+
+        # Get Datalab for comprehensive data quality report
+        lab = cleanlab.datalab()
+        lab.report()
+        ```
+    """
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: str,
+        model_type: ModelType = ModelType.REGRESSOR,
+    ):
+        """Initialize CleanlabModels with data preparation.
+
+        Args:
+            df: DataFrame containing data for analysis.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names.
+            target: Name of the target column.
+            model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).
+        """
+        if not CLEANLAB_AVAILABLE:
+            raise ImportError("cleanlab is not installed. Install with: pip install 'cleanlab[datalab]'")
+
+        self.id_column = id_column
+        self.target = target
+        self.model_type = model_type
+
+        # Filter to numeric features only
+        numeric_cols = df.select_dtypes(include=["number"]).columns
+        non_numeric = [f for f in features if f not in numeric_cols]
+        if non_numeric:
+            log.warning(f"Excluding non-numeric features: {non_numeric}")
+            features = [f for f in features if f in numeric_cols]
+        self.features = features
+
+        # Prepare clean data (shared by both models)
+        self._clean_df = df.dropna(subset=features + [target])[[id_column] + features + [target]].copy()
+        self._clean_df = self._clean_df.reset_index(drop=True)
+        self._X = self._clean_df[features].values
+        self._y = self._clean_df[target].values
+
+        # For classification, encode labels
+        self._label_encoder: Optional[LabelEncoder] = None
+        self._y_encoded = self._y
+        if model_type == ModelType.CLASSIFIER:
+            self._label_encoder = LabelEncoder()
+            self._y_encoded = self._label_encoder.fit_transform(self._y)
+
+        # Lazy-loaded models
+        self._clean_learning = None
+        self._datalab = None
+
+    def clean_learning(self):
+        """Get the CleanLearning model (fitted, with label issues computed).
+
+        Returns the cleanlab CleanLearning model with enhanced get_label_issues()
+        that includes the ID column, sorts by label quality, and decodes labels.
+
+        Returns:
+            CleanLearning: Fitted cleanlab model with methods like:
+                - get_label_issues(): DataFrame with id_column, sorted by label_quality
+                - predict(X): Make predictions
+                - For regression: get_epistemic_uncertainty(), get_aleatoric_uncertainty()
+        """
+        if self._clean_learning is not None:
+            return self._clean_learning
+
+        if self.model_type == ModelType.CLASSIFIER:
+            log.info("Building CleanLearning model (classification)...")
+            cl_model = CleanLearningClassifier(
+                HistGradientBoostingClassifier(),
+                find_label_issues_kwargs={"n_jobs": 1},
+            )
+            cl_model.fit(self._X, self._y_encoded)
+        else:
+            log.info("Building CleanLearning model (regression)...")
+            cl_model = CleanLearningRegressor(HistGradientBoostingRegressor())
+            cl_model.fit(self._X, self._y)
+
+        # Enhance get_label_issues to include id column, sort, and decode labels
+        original_get_label_issues = cl_model.get_label_issues
+        id_column = self.id_column
+        clean_df = self._clean_df
+        model_type = self.model_type
+        label_encoder = self._label_encoder
+
+        def get_label_issues_enhanced():
+            issues = original_get_label_issues().copy()
+            issues.insert(0, id_column, clean_df[id_column].values)
+            if model_type == ModelType.CLASSIFIER and label_encoder is not None:
+                for col in ["given_label", "predicted_label"]:
+                    if col in issues.columns:
+                        issues[col] = label_encoder.inverse_transform(issues[col])
+            return issues.sort_values("label_quality").reset_index(drop=True)
+
+        cl_model.get_label_issues = get_label_issues_enhanced
+
+        # For regression, enhance uncertainty methods to use stored data and return DataFrames
+        if model_type != ModelType.CLASSIFIER:
+            X = self._X
+            y = self._y
+            original_get_aleatoric = cl_model.get_aleatoric_uncertainty
+            original_get_epistemic = cl_model.get_epistemic_uncertainty
+
+            def get_aleatoric_uncertainty_enhanced():
+                residual = cl_model.predict(X) - y
+                return original_get_aleatoric(X, residual)
+
+            def get_epistemic_uncertainty_enhanced():
+                values = original_get_epistemic(X, y)
+                return (
+                    pd.DataFrame(
+                        {
+                            id_column: clean_df[id_column].values,
+                            "epistemic_uncertainty": values,
+                        }
+                    )
+                    .sort_values("epistemic_uncertainty", ascending=False)
+                    .reset_index(drop=True)
+                )
+
+            cl_model.get_aleatoric_uncertainty = get_aleatoric_uncertainty_enhanced
+            cl_model.get_epistemic_uncertainty = get_epistemic_uncertainty_enhanced
+
+        n_issues = original_get_label_issues()["is_label_issue"].sum()
+        log.info(f"CleanLearning: {n_issues} potential label issues out of {len(self._clean_df)} samples")
+
+        self._clean_learning = cl_model
+        return cl_model
+
+    def datalab(self):
+        """Get the Datalab instance (with find_issues already called).
+
+        Returns the native cleanlab Datalab for comprehensive data quality
+        analysis. Issues have already been detected.
+
+        Note: For classification, this will build the CleanLearning model first
+        (if not already built) to reuse its classifier for pred_probs.
+
+        Returns:
+            Datalab: Cleanlab Datalab instance with methods like:
+                - report(): Print comprehensive data quality report
+                - get_issues(): DataFrame with all detected issues
+                - get_issue_summary(): Summary statistics
+        """
+        if self._datalab is not None:
+            return self._datalab
+
+        log.info("Building Datalab model...")
+
+        # Create DataFrame with only numeric columns (features + target) for Datalab
+        datalab_df = self._clean_df[self.features + [self.target]]
+
+        # Create Datalab instance
+        if self.model_type == ModelType.CLASSIFIER:
+            lab = Datalab(data=datalab_df, label_name=self.target)
+            # Build CleanLearning first to reuse its classifier for pred_probs
+            cl = self.clean_learning()
+            pred_probs = cl.clf.predict_proba(self._X)
+            lab.find_issues(features=self._X, pred_probs=pred_probs)
+        else:
+            lab = Datalab(data=datalab_df, label_name=self.target, task="regression")
+            lab.find_issues(features=self._X)
+
+        self._datalab = lab
+        return lab
+
+
+# Keep the old function for backwards compatibility
+def create_cleanlab_model(
+    df: pd.DataFrame,
+    id_column: str,
+    features: List[str],
+    target: str,
+    model_type: ModelType = ModelType.REGRESSOR,
+):
+    """Create a CleanlabModels instance for label quality detection.
+
+    Args:
+        df: DataFrame containing data for label quality detection.
+        id_column: Name of the column used as the identifier.
+        features: List of feature column names.
+        target: Name of the target column.
+        model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).
+
+    Returns:
+        CleanlabModels: Factory providing access to CleanLearning and Datalab models.
+
+    Example:
+        ```python
+        cleanlab = create_cleanlab_model(df, "id", features, "target")
+
+        # Get CleanLearning model and label issues
+        cl = cleanlab.clean_learning()
+        issues = cl.get_label_issues()  # Includes ID column, sorted by quality
+
+        # Get Datalab for comprehensive data quality report
+        lab = cleanlab.datalab()
+        lab.report()
+        ```
+
+    References:
+        cleanlab: https://github.com/cleanlab/cleanlab
+    """
+    return CleanlabModels(df, id_column, features, target, model_type)
+
+
+if __name__ == "__main__":
+    from workbench.api import FeatureSet, Model
+    import numpy as np
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame with some noisy points
+    np.random.seed(42)
+    n_samples = 100
+
+    # Generate clean data: y = 2*x1 + 3*x2 + noise
+    x1 = np.random.randn(n_samples)
+    x2 = np.random.randn(n_samples)
+    y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1
+
+    # Add some noisy points (last 10 samples)
+    y_noisy = y_clean.copy()
+    y_noisy[-10:] += np.random.randn(10) * 20  # Large noise
+
+    data = {
+        "ID": [f"sample_{i}" for i in range(n_samples)],
+        "Feature1": x1,
+        "Feature2": x2,
+        "target": y_noisy,
+    }
+    df = pd.DataFrame(data)
+
+    print("=" * 80)
+    print("Testing CleanlabModels with synthetic data...")
+    print("=" * 80)
+
+    # Create CleanlabModels instance
+    cleanlab_models = create_cleanlab_model(
+        df,
+        id_column="ID",
+        features=["Feature1", "Feature2"],
+        target="target",
+    )
+
+    # Get CleanLearning model and test get_label_issues
+    cl = cleanlab_models.clean_learning()
+    print(f"CleanLearning type: {type(cl)}")
+
+    label_issues = cl.get_label_issues()
+    print("\nLabel issues (worst first, with ID column):")
+    print(label_issues.head(10))
+
+    # Check if our artificially noisy samples are detected
+    noisy_ids = [f"sample_{i}" for i in range(90, 100)]
+    worst_10 = label_issues.head(10)
+    detected = worst_10[worst_10["ID"].isin(noisy_ids)]
+    print(f"\nOf 10 noisy samples, {len(detected)} appear in worst 10")
+
+    # Test Datalab
+    print("\n" + "=" * 80)
+    print("Testing Datalab...")
+    print("=" * 80)
+    lab = cleanlab_models.datalab()
+    print(f"Datalab type: {type(lab)}")
+    print(f"Datalab issues shape: {lab.get_issues().shape}")
+    lab.report(num_examples=3)
+
+    # Test with real AQSol regression data
+    print("\n" + "=" * 80)
+    print("Testing with AQSol regression data...")
+    print("=" * 80)
+    fs = FeatureSet("aqsol_features")
+    df = fs.pull_dataframe()
+    model = Model("aqsol-regression")
+    features = model.features()
+    target = model.target()
+
+    cleanlab_models = create_cleanlab_model(
+        df,
+        id_column=fs.id_column,
+        features=features,
+        target=target,
+    )
+
+    # Get CleanLearning and label issues
+    cl = cleanlab_models.clean_learning()
+    label_issues = cl.get_label_issues()
+    print("\nLabel issues summary:")
+    print(f"Total samples: {len(label_issues)}")
+    print(f"Flagged as issues: {label_issues['is_label_issue'].sum()}")
+
+    print("\nWorst label quality samples:")
+    print(label_issues.head(10))
+
+    print("\nLabel quality distribution:")
+    print(label_issues["label_quality"].describe())
+
+    # Test uncertainty estimates (regression only)
+    print("\nTesting uncertainty estimates...")
+    aleatoric = cl.get_aleatoric_uncertainty()  # enhanced method; uses the stored X/y internally
+    print(f"Aleatoric: Data noise (irreducible) = {aleatoric}")
+    epistemic = cl.get_epistemic_uncertainty()  # enhanced method; returns a sorted DataFrame
+    print(f"Epistemic: Model uncertainty (reducible) = {epistemic[:10]} ...")
+
+    # Test Datalab report
+    print("\n" + "=" * 80)
+    print("Testing Datalab report (regression)...")
+    print("=" * 80)
+    lab = cleanlab_models.datalab()
+    lab.report(num_examples=3)
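The `__main__` block above only exercises the regression path. For the classifier path, usage would look roughly like this (a minimal sketch with synthetic data; the column names and data are illustrative, not from the package):

```python
import numpy as np
import pandas as pd

from workbench.algorithms.models.cleanlab_model import CleanlabModels
from workbench.core.artifacts.model_core import ModelType

# Two well-separated classes, with the last 10 labels deliberately flipped
rng = np.random.default_rng(0)
n = 200
x1, x2 = rng.normal(size=n), rng.normal(size=n)
labels = np.where(x1 + x2 > 0, "high", "low")
labels[-10:] = np.where(labels[-10:] == "high", "low", "high")

df = pd.DataFrame({"id": [f"s_{i}" for i in range(n)], "x1": x1, "x2": x2, "label": labels})

models = CleanlabModels(df, "id", ["x1", "x2"], "label", ModelType.CLASSIFIER)
issues = models.clean_learning().get_label_issues()
# Rows come back sorted worst-quality-first, with given_label/predicted_label
# decoded back to the original strings, so the flipped labels surface at the top
print(issues.head(10))
```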
workbench/algorithms/models/noise_model.py CHANGED
@@ -4,7 +4,7 @@ from xgboost import XGBRegressor
 from typing import List
 import logging
 
-from workbench.algorithms.dataframe.
+from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
 
 # Set up logging
 log = logging.getLogger("workbench")
@@ -228,7 +228,7 @@ class NoiseModel:
 
         # Proximity model for feature space analysis
         log.info(" Building proximity model...")
-        self.proximity =
+        self.proximity = FeatureSpaceProximity(
             self.df,
             id_column=self.id_column,
             features=self.features,
workbench/algorithms/sql/outliers.py CHANGED
@@ -209,9 +209,9 @@ class Outliers:
             else:
                 return group.nlargest(n, col)
 
-        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns
-        top_outliers = outlier_df.groupby("outlier_group", group_keys=False).apply(
-            get_extreme_values
+        # Group by 'outlier_group' and apply the helper function, explicitly selecting columns to silence warning
+        top_outliers = outlier_df.groupby("outlier_group", group_keys=False)[outlier_df.columns].apply(
+            get_extreme_values
         )
         return top_outliers.reset_index(drop=True)
 
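The outliers change above follows the standard fix for the recent pandas deprecation where `groupby(...).apply(...)` also operates on the grouping columns. A standalone sketch of the pattern on toy data (the helper mirrors `get_extreme_values`):

```python
import pandas as pd

df = pd.DataFrame({"outlier_group": ["a", "a", "b", "b"], "value": [1.0, 9.0, 5.0, 7.0]})

def get_extreme_values(group: pd.DataFrame) -> pd.DataFrame:
    # Keep the single most extreme row per group
    return group.nlargest(1, "value")

# Deprecated on pandas >= 2.2 (apply() also operated on the grouping column):
# top = df.groupby("outlier_group", group_keys=False).apply(get_extreme_values)

# Explicitly selecting the columns keeps the same result and silences the warning
top = df.groupby("outlier_group", group_keys=False)[df.columns].apply(get_extreme_values)
print(top.reset_index(drop=True))
```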
workbench/api/__init__.py CHANGED
@@ -5,6 +5,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 - DataSource: Manages AWS Data Catalog and Athena
 - FeatureSet: Manages AWS Feature Store and Feature Groups
 - Model: Manages the training and deployment of AWS Model Groups and Packages
+- MetaModel: A Model that aggregates predictions from multiple child endpoints
 - ModelType: Enum for the different model types supported by Workbench
 - Endpoint: Manages the deployment and invocations/inference on AWS Endpoints
 - Meta: Provides an API to retrieve AWS Metadata for the above classes
@@ -15,6 +16,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 from .data_source import DataSource
 from .feature_set import FeatureSet
 from .model import Model, ModelType, ModelFramework
+from .meta_model import MetaModel
 from .endpoint import Endpoint
 from .meta import Meta
 from .parameter_store import ParameterStore
@@ -24,6 +26,7 @@ __all__ = [
     "DataSource",
     "FeatureSet",
     "Model",
+    "MetaModel",
     "ModelType",
     "ModelFramework",
     "Endpoint",
workbench/api/endpoint.py CHANGED
@@ -44,16 +44,21 @@ class Endpoint(EndpointCore):
         """
         return super().inference(eval_df, capture_name, id_column, drop_error_rows)
 
-    def auto_inference(self
-        """Run inference on the Endpoint using the
+    def auto_inference(self) -> pd.DataFrame:
+        """Run inference on the Endpoint using the test data from the model training view
 
-
-
+        Returns:
+            pd.DataFrame: The DataFrame with predictions
+        """
+        return super().auto_inference()
+
+    def full_inference(self) -> pd.DataFrame:
+        """Run inference on the Endpoint using the full data from the model training view
 
         Returns:
             pd.DataFrame: The DataFrame with predictions
         """
-        return super().
+        return super().full_inference()
 
     def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
         """Run inference on the Endpoint using the provided DataFrame
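Given the signatures completed above, the two convenience methods would be used roughly like this (a sketch; the endpoint name is illustrative):

```python
from workbench.api import Endpoint

end = Endpoint("aqsol-regression-end")  # illustrative endpoint name

# Predictions on just the held-out test rows from the model's training view
test_preds = end.auto_inference()

# Predictions on every row of the training view
all_preds = end.full_inference()
print(test_preds.shape, all_preds.shape)
```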
workbench/api/feature_set.py CHANGED
@@ -154,23 +154,93 @@ class FeatureSet(FeatureSetCore):
         # Return the Model
         return Model(name)
 
-    def prox_model(
-
+    def prox_model(
+        self, target: str, features: list, include_all_columns: bool = False
+    ) -> "FeatureSpaceProximity":  # noqa: F821
+        """Create a local FeatureSpaceProximity Model for this FeatureSet
 
         Args:
             target (str): The target column name
             features (list): The list of feature column names
+            include_all_columns (bool): Include all DataFrame columns in results (default: False)
 
         Returns:
-
+            FeatureSpaceProximity: A local FeatureSpaceProximity Model
         """
-        from workbench.algorithms.dataframe.
+        from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity  # noqa: F401
 
         # Create the Proximity Model from the full FeatureSet dataframe
         full_df = self.pull_dataframe()
 
-        # Create and return the
-        return
+        # Create and return the FeatureSpaceProximity Model
+        return FeatureSpaceProximity(
+            full_df, id_column=self.id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+
+    def fp_prox_model(
+        self,
+        target: str,
+        fingerprint_column: str = None,
+        include_all_columns: bool = False,
+        radius: int = 2,
+        n_bits: int = 1024,
+        counts: bool = False,
+    ) -> "FingerprintProximity":  # noqa: F821
+        """Create a local FingerprintProximity Model for this FeatureSet
+
+        Args:
+            target (str): The target column name
+            fingerprint_column (str): Column containing fingerprints. If None, uses existing 'fingerprint'
+                column or computes from SMILES column.
+            include_all_columns (bool): Include all DataFrame columns in results (default: False)
+            radius (int): Radius for Morgan fingerprint computation (default: 2)
+            n_bits (int): Number of bits for fingerprint (default: 1024)
+            counts (bool): Whether to use count simulation (default: False)
+
+        Returns:
+            FingerprintProximity: A local FingerprintProximity Model
+        """
+        from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity  # noqa: F401
+
+        # Create the Proximity Model from the full FeatureSet dataframe
+        full_df = self.pull_dataframe()
+
+        # Create and return the FingerprintProximity Model
+        return FingerprintProximity(
+            full_df,
+            id_column=self.id_column,
+            fingerprint_column=fingerprint_column,
+            target=target,
+            include_all_columns=include_all_columns,
+            radius=radius,
+            n_bits=n_bits,
+            counts=counts,
+        )
+
+    def cleanlab_model(
+        self,
+        target: str,
+        features: list,
+        model_type: ModelType = ModelType.REGRESSOR,
+    ) -> "CleanLearning":  # noqa: F821
+        """Create a CleanLearning model for detecting label issues in this FeatureSet
+
+        Args:
+            target (str): The target column name
+            features (list): The list of feature column names
+            model_type (ModelType): The model type (REGRESSOR or CLASSIFIER). Defaults to REGRESSOR.
+
+        Returns:
+            CleanLearning: A fitted cleanlab model. Use get_label_issues() to get
+            a DataFrame with id_column, label_quality, predicted_label, given_label, is_label_issue.
+        """
+        from workbench.algorithms.models.cleanlab_model import create_cleanlab_model  # noqa: F401
+
+        # Get the full FeatureSet dataframe
+        full_df = self.pull_dataframe()
+
+        # Create and return the CleanLearning model
+        return create_cleanlab_model(full_df, self.id_column, features, target, model_type=model_type)
 
 
 if __name__ == "__main__":
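Taken together, the three new FeatureSet helpers compose like this (a sketch; the FeatureSet name comes from the test code above, while the target and feature names are illustrative):

```python
from workbench.api import FeatureSet, ModelType

fs = FeatureSet("aqsol_features")
target, features = "solubility", ["molwt", "logp"]  # illustrative column names

# Local feature-space proximity model over the numeric features
prox = fs.prox_model(target=target, features=features)

# Fingerprint proximity (computes Morgan fingerprints from SMILES if needed)
fp_prox = fs.fp_prox_model(target=target, radius=2, n_bits=1024)

# Cleanlab factory (a CleanlabModels instance, per cleanlab_model.py above)
cleanlab = fs.cleanlab_model(target=target, features=features, model_type=ModelType.REGRESSOR)
issues = cleanlab.clean_learning().get_label_issues()  # worst label quality first
print(issues.head())
```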