workbench 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (84) hide show
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  3. workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
  4. workbench/algorithms/dataframe/projection_2d.py +44 -21
  5. workbench/algorithms/dataframe/proximity.py +78 -150
  6. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  7. workbench/algorithms/models/cleanlab_model.py +382 -0
  8. workbench/algorithms/models/noise_model.py +388 -0
  9. workbench/algorithms/sql/outliers.py +3 -3
  10. workbench/api/__init__.py +3 -0
  11. workbench/api/df_store.py +17 -108
  12. workbench/api/endpoint.py +13 -11
  13. workbench/api/feature_set.py +111 -8
  14. workbench/api/meta_model.py +289 -0
  15. workbench/api/model.py +45 -12
  16. workbench/api/parameter_store.py +3 -52
  17. workbench/cached/cached_model.py +4 -4
  18. workbench/core/artifacts/artifact.py +5 -5
  19. workbench/core/artifacts/df_store_core.py +114 -0
  20. workbench/core/artifacts/endpoint_core.py +228 -237
  21. workbench/core/artifacts/feature_set_core.py +185 -230
  22. workbench/core/artifacts/model_core.py +34 -26
  23. workbench/core/artifacts/parameter_store_core.py +98 -0
  24. workbench/core/pipelines/pipeline_executor.py +1 -1
  25. workbench/core/transforms/features_to_model/features_to_model.py +22 -10
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  28. workbench/model_script_utils/model_script_utils.py +339 -0
  29. workbench/model_script_utils/pytorch_utils.py +405 -0
  30. workbench/model_script_utils/uq_harness.py +278 -0
  31. workbench/model_scripts/chemprop/chemprop.template +428 -631
  32. workbench/model_scripts/chemprop/generated_model_script.py +432 -635
  33. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  34. workbench/model_scripts/chemprop/requirements.txt +2 -10
  35. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  36. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  37. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  38. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  39. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  40. workbench/model_scripts/meta_model/meta_model.template +209 -0
  41. workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
  42. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  43. workbench/model_scripts/pytorch_model/pytorch.template +370 -609
  44. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  45. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  46. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  47. workbench/model_scripts/script_generation.py +6 -5
  48. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  49. workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
  50. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  51. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  52. workbench/model_scripts/xgb_model/xgb_model.template +366 -396
  53. workbench/repl/workbench_shell.py +0 -5
  54. workbench/resources/open_source_api.key +1 -1
  55. workbench/scripts/endpoint_test.py +2 -2
  56. workbench/scripts/meta_model_sim.py +35 -0
  57. workbench/scripts/training_test.py +85 -0
  58. workbench/utils/chem_utils/fingerprints.py +87 -46
  59. workbench/utils/chem_utils/projections.py +16 -6
  60. workbench/utils/chemprop_utils.py +36 -655
  61. workbench/utils/meta_model_simulator.py +499 -0
  62. workbench/utils/metrics_utils.py +256 -0
  63. workbench/utils/model_utils.py +192 -54
  64. workbench/utils/pytorch_utils.py +33 -472
  65. workbench/utils/shap_utils.py +1 -55
  66. workbench/utils/xgboost_local_crossfold.py +267 -0
  67. workbench/utils/xgboost_model_utils.py +49 -356
  68. workbench/web_interface/components/model_plot.py +7 -1
  69. workbench/web_interface/components/plugins/model_details.py +30 -68
  70. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  71. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
  72. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
  73. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
  74. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  75. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  76. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  77. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  78. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  79. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  80. workbench/model_scripts/uq_models/mapie.template +0 -605
  81. workbench/model_scripts/uq_models/requirements.txt +0 -1
  82. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  83. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
  84. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,382 @@
1
+ """Cleanlab-based label quality detection for regression and classification.
2
+
3
+ Note: Users must install cleanlab separately: pip install cleanlab
4
+ """
5
+
6
+ import logging
7
+ from typing import List, Optional
8
+
9
+ import datasets
10
+ import pandas as pd
11
+ from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
12
+ from sklearn.preprocessing import LabelEncoder
13
+
14
+ from workbench.core.artifacts.model_core import ModelType
15
+
16
# Check datasets version - Datalab has a bug with datasets>=4.0.0
# See: https://github.com/cleanlab/cleanlab/issues/1253
# Fail fast at import time rather than with an obscure error inside Datalab.
_datasets_major = int(datasets.__version__.split(".")[0])
if _datasets_major >= 4:
    raise ImportError(
        "cleanlab's Datalab requires datasets<4.0.0 due to a known bug.\n"
        "See: https://github.com/cleanlab/cleanlab/issues/1253\n"
        "Fix: pip install 'datasets<4.0.0'"
    )

# Check for cleanlab package. cleanlab is an optional dependency: if it is
# missing we record that via CLEANLAB_AVAILABLE and defer the hard failure to
# CleanlabModels.__init__, so merely importing this module stays safe.
try:
    from cleanlab.regression.learn import CleanLearning as CleanLearningRegressor
    from cleanlab.classification import CleanLearning as CleanLearningClassifier
    from cleanlab import Datalab

    CLEANLAB_AVAILABLE = True
except ImportError:
    CLEANLAB_AVAILABLE = False
    # Placeholders so module-level references don't raise NameError.
    CleanLearningRegressor = None
    CleanLearningClassifier = None
    Datalab = None

# Regressor types for convenience (all ModelType variants treated as regression)
REGRESSOR_TYPES = [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]

# Set up logging
log = logging.getLogger("workbench")
46
class CleanlabModels:
    """Factory class for cleanlab models with shared data preparation.

    This class handles data preparation once and provides lazy-loaded access
    to both CleanLearning and Datalab models. Each model is only created
    when first requested, and the prepared data is shared between them.

    Attributes:
        id_column: Name of the ID column in the data.
        features: List of feature column names.
        target: Name of the target column.
        model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).

    Example:
        ```python
        cleanlab = CleanlabModels(df, "id", features, "target", ModelType.REGRESSOR)

        # Get CleanLearning model for label issues and uncertainty
        cl = cleanlab.clean_learning()
        issues = cl.get_label_issues()

        # Get Datalab for comprehensive data quality report
        lab = cleanlab.datalab()
        lab.report()
        ```
    """

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        features: List[str],
        target: str,
        model_type: ModelType = ModelType.REGRESSOR,
    ):
        """Initialize CleanlabModels with data preparation.

        Args:
            df: DataFrame containing data for analysis.
            id_column: Name of the column used as the identifier.
            features: List of feature column names.
            target: Name of the target column.
            model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).

        Raises:
            ImportError: If the optional cleanlab package is not installed.
        """
        if not CLEANLAB_AVAILABLE:
            raise ImportError("cleanlab is not installed. Install with: pip install 'cleanlab[datalab]'")

        self.id_column = id_column
        self.target = target
        self.model_type = model_type

        # Filter to numeric features only (cleanlab/sklearn estimators here
        # only accept numeric arrays); warn about anything dropped.
        numeric_cols = df.select_dtypes(include=["number"]).columns
        non_numeric = [f for f in features if f not in numeric_cols]
        if non_numeric:
            log.warning(f"Excluding non-numeric features: {non_numeric}")
        features = [f for f in features if f in numeric_cols]
        self.features = features

        # Prepare clean data (shared by both models): drop rows with NaNs in
        # any feature or the target, keep only the columns we need, and reset
        # the index so positional results from cleanlab align with _clean_df.
        self._clean_df = df.dropna(subset=features + [target])[[id_column] + features + [target]].copy()
        self._clean_df = self._clean_df.reset_index(drop=True)
        self._X = self._clean_df[features].values
        self._y = self._clean_df[target].values

        # For classification, encode labels (cleanlab/sklearn expect integer
        # class labels); the encoder is kept to decode labels on output.
        self._label_encoder: Optional[LabelEncoder] = None
        self._y_encoded = self._y
        if model_type == ModelType.CLASSIFIER:
            self._label_encoder = LabelEncoder()
            self._y_encoded = self._label_encoder.fit_transform(self._y)

        # Lazy-loaded models (built on first access, then cached)
        self._clean_learning = None
        self._datalab = None

    def clean_learning(self):
        """Get the CleanLearning model (fitted, with label issues computed).

        Returns the cleanlab CleanLearning model with enhanced get_label_issues()
        that includes the ID column, sorts by label quality, and decodes labels.

        Returns:
            CleanLearning: Fitted cleanlab model with methods like:
                - get_label_issues(): DataFrame with id_column, sorted by label_quality
                - predict(X): Make predictions
                - For regression: get_epistemic_uncertainty(), get_aleatoric_uncertainty()
                  (both replaced with zero-argument wrappers; see notes below)
        """
        # Return cached model if we've already built one
        if self._clean_learning is not None:
            return self._clean_learning

        if self.model_type == ModelType.CLASSIFIER:
            log.info("Building CleanLearning model (classification)...")
            cl_model = CleanLearningClassifier(
                HistGradientBoostingClassifier(),
                # n_jobs=1 avoids multiprocessing during issue detection
                find_label_issues_kwargs={"n_jobs": 1},
            )
            cl_model.fit(self._X, self._y_encoded)
        else:
            log.info("Building CleanLearning model (regression)...")
            cl_model = CleanLearningRegressor(HistGradientBoostingRegressor())
            cl_model.fit(self._X, self._y)

        # Enhance get_label_issues to include id column, sort, and decode labels.
        # We capture the original bound method and the data we need in locals so
        # the replacement closures below don't depend on `self`.
        original_get_label_issues = cl_model.get_label_issues
        id_column = self.id_column
        clean_df = self._clean_df
        model_type = self.model_type
        label_encoder = self._label_encoder

        def get_label_issues_enhanced():
            # Copy so we never mutate cleanlab's internal issue frame
            issues = original_get_label_issues().copy()
            issues.insert(0, id_column, clean_df[id_column].values)
            if model_type == ModelType.CLASSIFIER and label_encoder is not None:
                # Decode integer class labels back to their original values
                for col in ["given_label", "predicted_label"]:
                    if col in issues.columns:
                        issues[col] = label_encoder.inverse_transform(issues[col])
            # Worst-quality labels first
            return issues.sort_values("label_quality").reset_index(drop=True)

        # Monkey-patch the bound method on this instance (not the class)
        cl_model.get_label_issues = get_label_issues_enhanced

        # For regression, enhance uncertainty methods to use stored data and return DataFrames
        # NOTE(review): only the epistemic wrapper actually returns a DataFrame;
        # the aleatoric wrapper returns whatever cleanlab's method returns.
        if model_type != ModelType.CLASSIFIER:
            X = self._X
            y = self._y
            original_get_aleatoric = cl_model.get_aleatoric_uncertainty
            original_get_epistemic = cl_model.get_epistemic_uncertainty

            def get_aleatoric_uncertainty_enhanced():
                # Aleatoric (irreducible) noise estimated from model residuals
                residual = cl_model.predict(X) - y
                return original_get_aleatoric(X, residual)

            def get_epistemic_uncertainty_enhanced():
                # Epistemic (model) uncertainty per sample, worst first
                values = original_get_epistemic(X, y)
                return (
                    pd.DataFrame(
                        {
                            id_column: clean_df[id_column].values,
                            "epistemic_uncertainty": values,
                        }
                    )
                    .sort_values("epistemic_uncertainty", ascending=False)
                    .reset_index(drop=True)
                )

            # Both replacements take NO arguments (they close over X/y above);
            # callers must not pass the data the original cleanlab methods took.
            cl_model.get_aleatoric_uncertainty = get_aleatoric_uncertainty_enhanced
            cl_model.get_epistemic_uncertainty = get_epistemic_uncertainty_enhanced

        # Use the *original* method here: the enhanced one adds/sorts columns
        # we don't need for a simple count.
        n_issues = original_get_label_issues()["is_label_issue"].sum()
        log.info(f"CleanLearning: {n_issues} potential label issues out of {len(self._clean_df)} samples")

        self._clean_learning = cl_model
        return cl_model

    def datalab(self):
        """Get the Datalab instance (with find_issues already called).

        Returns the native cleanlab Datalab for comprehensive data quality
        analysis. Issues have already been detected.

        Note: For classification, this will build the CleanLearning model first
        (if not already built) to reuse its classifier for pred_probs.

        Returns:
            Datalab: Cleanlab Datalab instance with methods like:
                - report(): Print comprehensive data quality report
                - get_issues(): DataFrame with all detected issues
                - get_issue_summary(): Summary statistics
        """
        # Return cached instance if we've already built one
        if self._datalab is not None:
            return self._datalab

        log.info("Building Datalab model...")

        # Create DataFrame with only numeric columns (features + target) for Datalab
        datalab_df = self._clean_df[self.features + [self.target]]

        # Create Datalab instance
        if self.model_type == ModelType.CLASSIFIER:
            lab = Datalab(data=datalab_df, label_name=self.target)
            # Build CleanLearning first to reuse its classifier for pred_probs
            cl = self.clean_learning()
            pred_probs = cl.clf.predict_proba(self._X)
            lab.find_issues(features=self._X, pred_probs=pred_probs)
        else:
            lab = Datalab(data=datalab_df, label_name=self.target, task="regression")
            lab.find_issues(features=self._X)

        self._datalab = lab
        return lab
236
+
237
+
238
# Backwards-compatible functional entry point (predates the CleanlabModels class)
def create_cleanlab_model(
    df: pd.DataFrame,
    id_column: str,
    features: List[str],
    target: str,
    model_type: ModelType = ModelType.REGRESSOR,
):
    """Build a CleanlabModels factory for label quality detection.

    Thin wrapper retained for callers written against the original
    function-based API; all real work happens in CleanlabModels.

    Args:
        df: DataFrame containing data for label quality detection.
        id_column: Name of the column used as the identifier.
        features: List of feature column names.
        target: Name of the target column.
        model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).

    Returns:
        CleanlabModels: Factory providing access to CleanLearning and Datalab models.

    Example:
        ```python
        cleanlab = create_cleanlab_model(df, "id", features, "target")

        # Get CleanLearning model and label issues
        cl = cleanlab.clean_learning()
        issues = cl.get_label_issues()  # Includes ID column, sorted by quality

        # Get Datalab for comprehensive data quality report
        lab = cleanlab.datalab()
        lab.report()
        ```

    References:
        cleanlab: https://github.com/cleanlab/cleanlab
    """
    models = CleanlabModels(df, id_column, features, target, model_type)
    return models
275
+
276
+
277
if __name__ == "__main__":
    # Smoke test: synthetic regression data with injected label noise, then a
    # real workbench FeatureSet/Model. Requires cleanlab and AWS artifacts.
    from workbench.api import FeatureSet, Model
    import numpy as np

    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Create a sample DataFrame with some noisy points
    np.random.seed(42)
    n_samples = 100

    # Generate clean data: y = 2*x1 + 3*x2 + noise
    x1 = np.random.randn(n_samples)
    x2 = np.random.randn(n_samples)
    y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1

    # Add some noisy points (last 10 samples)
    y_noisy = y_clean.copy()
    y_noisy[-10:] += np.random.randn(10) * 20  # Large noise

    data = {
        "ID": [f"sample_{i}" for i in range(n_samples)],
        "Feature1": x1,
        "Feature2": x2,
        "target": y_noisy,
    }
    df = pd.DataFrame(data)

    print("=" * 80)
    print("Testing CleanlabModels with synthetic data...")
    print("=" * 80)

    # Create CleanlabModels instance
    cleanlab_models = create_cleanlab_model(
        df,
        id_column="ID",
        features=["Feature1", "Feature2"],
        target="target",
    )

    # Get CleanLearning model and test get_label_issues
    cl = cleanlab_models.clean_learning()
    print(f"CleanLearning type: {type(cl)}")

    label_issues = cl.get_label_issues()
    print("\nLabel issues (worst first, with ID column):")
    print(label_issues.head(10))

    # Check if our artificially noisy samples are detected
    noisy_ids = [f"sample_{i}" for i in range(90, 100)]
    worst_10 = label_issues.head(10)
    detected = worst_10[worst_10["ID"].isin(noisy_ids)]
    print(f"\nOf 10 noisy samples, {len(detected)} appear in worst 10")

    # Test Datalab
    print("\n" + "=" * 80)
    print("Testing Datalab...")
    print("=" * 80)
    lab = cleanlab_models.datalab()
    print(f"Datalab type: {type(lab)}")
    print(f"Datalab issues shape: {lab.get_issues().shape}")
    lab.report(num_examples=3)

    # Test with real AQSol regression data
    print("\n" + "=" * 80)
    print("Testing with AQSol regression data...")
    print("=" * 80)
    fs = FeatureSet("aqsol_features")
    df = fs.pull_dataframe()
    model = Model("aqsol-regression")
    features = model.features()
    target = model.target()

    cleanlab_models = create_cleanlab_model(
        df,
        id_column=fs.id_column,
        features=features,
        target=target,
    )

    # Get CleanLearning and label issues
    cl = cleanlab_models.clean_learning()
    label_issues = cl.get_label_issues()
    print("\nLabel issues summary:")
    print(f"Total samples: {len(label_issues)}")
    print(f"Flagged as issues: {label_issues['is_label_issue'].sum()}")

    print("\nWorst label quality samples:")
    print(label_issues.head(10))

    print("\nLabel quality distribution:")
    print(label_issues["label_quality"].describe())

    # Test uncertainty estimates (regression only).
    # BUG FIX: clean_learning() replaces get_aleatoric_uncertainty() and
    # get_epistemic_uncertainty() with zero-argument wrappers that close over
    # the stored X/y — the old calls passed (X, residual)/(X, y) and raised
    # TypeError. Call them with no arguments.
    print("\nTesting uncertainty estimates...")
    aleatoric = cl.get_aleatoric_uncertainty()
    print(f"Aleatoric: Data noise (irreducible) = {aleatoric}")
    epistemic = cl.get_epistemic_uncertainty()
    print(f"Epistemic: Model uncertainty (reducible) = {epistemic[:10]} ...")

    # Test Datalab report
    print("\n" + "=" * 80)
    print("Testing Datalab report (regression)...")
    print("=" * 80)
    lab = cleanlab_models.datalab()
    lab.report(num_examples=3)