workbench 0.8.213__py3-none-any.whl → 0.8.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (58)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/algorithms/sql/outliers.py +3 -3
  9. workbench/api/__init__.py +3 -0
  10. workbench/api/endpoint.py +10 -5
  11. workbench/api/feature_set.py +76 -6
  12. workbench/api/meta_model.py +289 -0
  13. workbench/api/model.py +43 -4
  14. workbench/core/artifacts/endpoint_core.py +65 -117
  15. workbench/core/artifacts/feature_set_core.py +3 -3
  16. workbench/core/artifacts/model_core.py +6 -4
  17. workbench/core/pipelines/pipeline_executor.py +1 -1
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  19. workbench/model_script_utils/model_script_utils.py +15 -11
  20. workbench/model_script_utils/pytorch_utils.py +11 -1
  21. workbench/model_scripts/chemprop/chemprop.template +147 -71
  22. workbench/model_scripts/chemprop/generated_model_script.py +151 -75
  23. workbench/model_scripts/chemprop/model_script_utils.py +15 -11
  24. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  25. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  27. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  28. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  29. workbench/model_scripts/meta_model/meta_model.template +209 -0
  30. workbench/model_scripts/pytorch_model/generated_model_script.py +45 -27
  31. workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
  32. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  33. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  34. workbench/model_scripts/script_generation.py +4 -0
  35. workbench/model_scripts/xgb_model/generated_model_script.py +167 -156
  36. workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
  37. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  38. workbench/repl/workbench_shell.py +0 -5
  39. workbench/scripts/endpoint_test.py +2 -2
  40. workbench/scripts/meta_model_sim.py +35 -0
  41. workbench/utils/chem_utils/fingerprints.py +87 -46
  42. workbench/utils/chemprop_utils.py +23 -5
  43. workbench/utils/meta_model_simulator.py +499 -0
  44. workbench/utils/metrics_utils.py +94 -10
  45. workbench/utils/model_utils.py +91 -9
  46. workbench/utils/pytorch_utils.py +1 -1
  47. workbench/utils/shap_utils.py +1 -55
  48. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  49. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/METADATA +2 -1
  50. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/RECORD +54 -50
  51. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
  52. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  53. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  54. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  55. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  56. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
  57. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
  58. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
workbench/algorithms/models/cleanlab_model.py ADDED
@@ -0,0 +1,382 @@
+ """Cleanlab-based label quality detection for regression and classification.
+
+ Note: Users must install cleanlab separately: pip install cleanlab
+ """
+
+ import logging
+ from typing import List, Optional
+
+ import datasets
+ import pandas as pd
+ from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
+ from sklearn.preprocessing import LabelEncoder
+
+ from workbench.core.artifacts.model_core import ModelType
+
+ # Check datasets version - Datalab has a bug with datasets>=4.0.0
+ # See: https://github.com/cleanlab/cleanlab/issues/1253
+ _datasets_major = int(datasets.__version__.split(".")[0])
+ if _datasets_major >= 4:
+     raise ImportError(
+         "cleanlab's Datalab requires datasets<4.0.0 due to a known bug.\n"
+         "See: https://github.com/cleanlab/cleanlab/issues/1253\n"
+         "Fix: pip install 'datasets<4.0.0'"
+     )
+
+ # Check for cleanlab package
+ try:
+     from cleanlab.regression.learn import CleanLearning as CleanLearningRegressor
+     from cleanlab.classification import CleanLearning as CleanLearningClassifier
+     from cleanlab import Datalab
+
+     CLEANLAB_AVAILABLE = True
+ except ImportError:
+     CLEANLAB_AVAILABLE = False
+     CleanLearningRegressor = None
+     CleanLearningClassifier = None
+     Datalab = None
+
+ # Regressor types for convenience
+ REGRESSOR_TYPES = [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]
+
+ # Set up logging
+ log = logging.getLogger("workbench")
+
+
+ class CleanlabModels:
+     """Factory class for cleanlab models with shared data preparation.
+
+     This class handles data preparation once and provides lazy-loaded access
+     to both CleanLearning and Datalab models. Each model is only created
+     when first requested, and the prepared data is shared between them.
+
+     Attributes:
+         id_column: Name of the ID column in the data.
+         features: List of feature column names.
+         target: Name of the target column.
+         model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).
+
+     Example:
+         ```python
+         cleanlab = CleanlabModels(df, "id", features, "target", ModelType.REGRESSOR)
+
+         # Get CleanLearning model for label issues and uncertainty
+         cl = cleanlab.clean_learning()
+         issues = cl.get_label_issues()
+
+         # Get Datalab for comprehensive data quality report
+         lab = cleanlab.datalab()
+         lab.report()
+         ```
+     """
+
+     def __init__(
+         self,
+         df: pd.DataFrame,
+         id_column: str,
+         features: List[str],
+         target: str,
+         model_type: ModelType = ModelType.REGRESSOR,
+     ):
+         """Initialize CleanlabModels with data preparation.
+
+         Args:
+             df: DataFrame containing data for analysis.
+             id_column: Name of the column used as the identifier.
+             features: List of feature column names.
+             target: Name of the target column.
+             model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).
+         """
+         if not CLEANLAB_AVAILABLE:
+             raise ImportError("cleanlab is not installed. Install with: pip install 'cleanlab[datalab]'")
+
+         self.id_column = id_column
+         self.target = target
+         self.model_type = model_type
+
+         # Filter to numeric features only
+         numeric_cols = df.select_dtypes(include=["number"]).columns
+         non_numeric = [f for f in features if f not in numeric_cols]
+         if non_numeric:
+             log.warning(f"Excluding non-numeric features: {non_numeric}")
+             features = [f for f in features if f in numeric_cols]
+         self.features = features
+
+         # Prepare clean data (shared by both models)
+         self._clean_df = df.dropna(subset=features + [target])[[id_column] + features + [target]].copy()
+         self._clean_df = self._clean_df.reset_index(drop=True)
+         self._X = self._clean_df[features].values
+         self._y = self._clean_df[target].values
+
+         # For classification, encode labels
+         self._label_encoder: Optional[LabelEncoder] = None
+         self._y_encoded = self._y
+         if model_type == ModelType.CLASSIFIER:
+             self._label_encoder = LabelEncoder()
+             self._y_encoded = self._label_encoder.fit_transform(self._y)
+
+         # Lazy-loaded models
+         self._clean_learning = None
+         self._datalab = None
+
+     def clean_learning(self):
+         """Get the CleanLearning model (fitted, with label issues computed).
+
+         Returns the cleanlab CleanLearning model with enhanced get_label_issues()
+         that includes the ID column, sorts by label quality, and decodes labels.
+
+         Returns:
+             CleanLearning: Fitted cleanlab model with methods like:
+                 - get_label_issues(): DataFrame with id_column, sorted by label_quality
+                 - predict(X): Make predictions
+                 - For regression: get_epistemic_uncertainty(), get_aleatoric_uncertainty()
+         """
+         if self._clean_learning is not None:
+             return self._clean_learning
+
+         if self.model_type == ModelType.CLASSIFIER:
+             log.info("Building CleanLearning model (classification)...")
+             cl_model = CleanLearningClassifier(
+                 HistGradientBoostingClassifier(),
+                 find_label_issues_kwargs={"n_jobs": 1},
+             )
+             cl_model.fit(self._X, self._y_encoded)
+         else:
+             log.info("Building CleanLearning model (regression)...")
+             cl_model = CleanLearningRegressor(HistGradientBoostingRegressor())
+             cl_model.fit(self._X, self._y)
+
+         # Enhance get_label_issues to include id column, sort, and decode labels
+         original_get_label_issues = cl_model.get_label_issues
+         id_column = self.id_column
+         clean_df = self._clean_df
+         model_type = self.model_type
+         label_encoder = self._label_encoder
+
+         def get_label_issues_enhanced():
+             issues = original_get_label_issues().copy()
+             issues.insert(0, id_column, clean_df[id_column].values)
+             if model_type == ModelType.CLASSIFIER and label_encoder is not None:
+                 for col in ["given_label", "predicted_label"]:
+                     if col in issues.columns:
+                         issues[col] = label_encoder.inverse_transform(issues[col])
+             return issues.sort_values("label_quality").reset_index(drop=True)
+
+         cl_model.get_label_issues = get_label_issues_enhanced
+
+         # For regression, enhance uncertainty methods to use stored data and return DataFrames
+         if model_type != ModelType.CLASSIFIER:
+             X = self._X
+             y = self._y
+             original_get_aleatoric = cl_model.get_aleatoric_uncertainty
+             original_get_epistemic = cl_model.get_epistemic_uncertainty
+
+             def get_aleatoric_uncertainty_enhanced():
+                 residual = cl_model.predict(X) - y
+                 return original_get_aleatoric(X, residual)
+
+             def get_epistemic_uncertainty_enhanced():
+                 values = original_get_epistemic(X, y)
+                 return (
+                     pd.DataFrame(
+                         {
+                             id_column: clean_df[id_column].values,
+                             "epistemic_uncertainty": values,
+                         }
+                     )
+                     .sort_values("epistemic_uncertainty", ascending=False)
+                     .reset_index(drop=True)
+                 )
+
+             cl_model.get_aleatoric_uncertainty = get_aleatoric_uncertainty_enhanced
+             cl_model.get_epistemic_uncertainty = get_epistemic_uncertainty_enhanced
+
+         n_issues = original_get_label_issues()["is_label_issue"].sum()
+         log.info(f"CleanLearning: {n_issues} potential label issues out of {len(self._clean_df)} samples")
+
+         self._clean_learning = cl_model
+         return cl_model
+
+     def datalab(self):
+         """Get the Datalab instance (with find_issues already called).
+
+         Returns the native cleanlab Datalab for comprehensive data quality
+         analysis. Issues have already been detected.
+
+         Note: For classification, this will build the CleanLearning model first
+         (if not already built) to reuse its classifier for pred_probs.
+
+         Returns:
+             Datalab: Cleanlab Datalab instance with methods like:
+                 - report(): Print comprehensive data quality report
+                 - get_issues(): DataFrame with all detected issues
+                 - get_issue_summary(): Summary statistics
+         """
+         if self._datalab is not None:
+             return self._datalab
+
+         log.info("Building Datalab model...")
+
+         # Create DataFrame with only numeric columns (features + target) for Datalab
+         datalab_df = self._clean_df[self.features + [self.target]]
+
+         # Create Datalab instance
+         if self.model_type == ModelType.CLASSIFIER:
+             lab = Datalab(data=datalab_df, label_name=self.target)
+             # Build CleanLearning first to reuse its classifier for pred_probs
+             cl = self.clean_learning()
+             pred_probs = cl.clf.predict_proba(self._X)
+             lab.find_issues(features=self._X, pred_probs=pred_probs)
+         else:
+             lab = Datalab(data=datalab_df, label_name=self.target, task="regression")
+             lab.find_issues(features=self._X)
+
+         self._datalab = lab
+         return lab
+
+
+ # Keep the old function for backwards compatibility
+ def create_cleanlab_model(
+     df: pd.DataFrame,
+     id_column: str,
+     features: List[str],
+     target: str,
+     model_type: ModelType = ModelType.REGRESSOR,
+ ):
+     """Create a CleanlabModels instance for label quality detection.
+
+     Args:
+         df: DataFrame containing data for label quality detection.
+         id_column: Name of the column used as the identifier.
+         features: List of feature column names.
+         target: Name of the target column.
+         model_type: ModelType (REGRESSOR, CLASSIFIER, etc.).
+
+     Returns:
+         CleanlabModels: Factory providing access to CleanLearning and Datalab models.
+
+     Example:
+         ```python
+         cleanlab = create_cleanlab_model(df, "id", features, "target")
+
+         # Get CleanLearning model and label issues
+         cl = cleanlab.clean_learning()
+         issues = cl.get_label_issues()  # Includes ID column, sorted by quality
+
+         # Get Datalab for comprehensive data quality report
+         lab = cleanlab.datalab()
+         lab.report()
+         ```
+
+     References:
+         cleanlab: https://github.com/cleanlab/cleanlab
+     """
+     return CleanlabModels(df, id_column, features, target, model_type)
+
+
+ if __name__ == "__main__":
+     from workbench.api import FeatureSet, Model
+     import numpy as np
+
+     pd.set_option("display.max_columns", None)
+     pd.set_option("display.width", 1000)
+
+     # Create a sample DataFrame with some noisy points
+     np.random.seed(42)
+     n_samples = 100
+
+     # Generate clean data: y = 2*x1 + 3*x2 + noise
+     x1 = np.random.randn(n_samples)
+     x2 = np.random.randn(n_samples)
+     y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1
+
+     # Add some noisy points (last 10 samples)
+     y_noisy = y_clean.copy()
+     y_noisy[-10:] += np.random.randn(10) * 20  # Large noise
+
+     data = {
+         "ID": [f"sample_{i}" for i in range(n_samples)],
+         "Feature1": x1,
+         "Feature2": x2,
+         "target": y_noisy,
+     }
+     df = pd.DataFrame(data)
+
+     print("=" * 80)
+     print("Testing CleanlabModels with synthetic data...")
+     print("=" * 80)
+
+     # Create CleanlabModels instance
+     cleanlab_models = create_cleanlab_model(
+         df,
+         id_column="ID",
+         features=["Feature1", "Feature2"],
+         target="target",
+     )
+
+     # Get CleanLearning model and test get_label_issues
+     cl = cleanlab_models.clean_learning()
+     print(f"CleanLearning type: {type(cl)}")
+
+     label_issues = cl.get_label_issues()
+     print("\nLabel issues (worst first, with ID column):")
+     print(label_issues.head(10))
+
+     # Check if our artificially noisy samples are detected
+     noisy_ids = [f"sample_{i}" for i in range(90, 100)]
+     worst_10 = label_issues.head(10)
+     detected = worst_10[worst_10["ID"].isin(noisy_ids)]
+     print(f"\nOf 10 noisy samples, {len(detected)} appear in worst 10")
+
+     # Test Datalab
+     print("\n" + "=" * 80)
+     print("Testing Datalab...")
+     print("=" * 80)
+     lab = cleanlab_models.datalab()
+     print(f"Datalab type: {type(lab)}")
+     print(f"Datalab issues shape: {lab.get_issues().shape}")
+     lab.report(num_examples=3)
+
+     # Test with real AQSol regression data
+     print("\n" + "=" * 80)
+     print("Testing with AQSol regression data...")
+     print("=" * 80)
+     fs = FeatureSet("aqsol_features")
+     df = fs.pull_dataframe()
+     model = Model("aqsol-regression")
+     features = model.features()
+     target = model.target()
+
+     cleanlab_models = create_cleanlab_model(
+         df,
+         id_column=fs.id_column,
+         features=features,
+         target=target,
+     )
+
+     # Get CleanLearning and label issues
+     cl = cleanlab_models.clean_learning()
+     label_issues = cl.get_label_issues()
+     print("\nLabel issues summary:")
+     print(f"Total samples: {len(label_issues)}")
+     print(f"Flagged as issues: {label_issues['is_label_issue'].sum()}")
+
+     print("\nWorst label quality samples:")
+     print(label_issues.head(10))
+
+     print("\nLabel quality distribution:")
+     print(label_issues["label_quality"].describe())
+
+     # Test uncertainty estimates (regression only)
+     print("\nTesting uncertainty estimates...")
+     aleatoric = cl.get_aleatoric_uncertainty(cleanlab_models._X, cl.predict(cleanlab_models._X) - cleanlab_models._y)
+     print(f"Aleatoric: Data noise (irreducible) = {aleatoric}")
+     epistemic = cl.get_epistemic_uncertainty(cleanlab_models._X, cleanlab_models._y)
+     print(f"Epistemic: Model uncertainty (reducible) = {epistemic[:10]} ...")
+
+     # Test Datalab report
+     print("\n" + "=" * 80)
+     print("Testing Datalab report (regression)...")
+     print("=" * 80)
+     lab = cleanlab_models.datalab()
+     lab.report(num_examples=3)
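
Note: the new module is largely self-documenting, but for orientation, here is a minimal standalone sketch of the intended workflow (the toy DataFrame and column names are illustrative, not from the package; requires cleanlab and datasets<4.0.0 installed per the module's own checks):

```python
import numpy as np
import pandas as pd

from workbench.algorithms.models.cleanlab_model import create_cleanlab_model

# Toy regression frame: an ID column, two numeric features, and a target
rng = np.random.default_rng(0)
df = pd.DataFrame({"id": range(200), "x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df["target"] = 2 * df["x1"] + 3 * df["x2"] + rng.normal(scale=0.1, size=200)

models = create_cleanlab_model(df, id_column="id", features=["x1", "x2"], target="target")

cl = models.clean_learning()    # lazily fits CleanLearning on first call
issues = cl.get_label_issues()  # enhanced: ID column included, sorted worst-first
lab = models.datalab()          # lazily runs Datalab.find_issues()
lab.report()
```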
workbench/algorithms/models/noise_model.py CHANGED
@@ -4,7 +4,7 @@ from xgboost import XGBRegressor
  from typing import List
  import logging
 
- from workbench.algorithms.dataframe.proximity import Proximity
+ from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
 
  # Set up logging
  log = logging.getLogger("workbench")
@@ -228,7 +228,7 @@ class NoiseModel:
 
          # Proximity model for feature space analysis
          log.info(" Building proximity model...")
-         self.proximity = Proximity(
+         self.proximity = FeatureSpaceProximity(
              self.df,
              id_column=self.id_column,
              features=self.features,
workbench/algorithms/sql/outliers.py CHANGED
@@ -209,9 +209,9 @@ class Outliers:
              else:
                  return group.nlargest(n, col)
 
-         # Group by 'outlier_group' and apply the helper function, explicitly selecting columns
-         top_outliers = outlier_df.groupby("outlier_group", group_keys=False).apply(
-             get_extreme_values, include_groups=True
+         # Group by 'outlier_group' and apply the helper function, explicitly selecting columns to silence warning
+         top_outliers = outlier_df.groupby("outlier_group", group_keys=False)[outlier_df.columns].apply(
+             get_extreme_values
          )
          return top_outliers.reset_index(drop=True)
 
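Note: the outliers.py change addresses the pandas 2.x deprecation of `DataFrameGroupBy.apply` operating on the grouping columns; selecting the columns explicitly after `groupby` silences the FutureWarning. A standalone sketch of the pattern (toy data, not from the package):

```python
import pandas as pd

df = pd.DataFrame({"outlier_group": ["a", "a", "b", "b"], "value": [1, 9, 2, 8]})

# Old pattern: pandas 2.2+ warns that apply() operated on the grouping columns
# top = df.groupby("outlier_group", group_keys=False).apply(lambda g: g.nlargest(1, "value"))

# New pattern: explicit column selection keeps the same result, no FutureWarning
top = df.groupby("outlier_group", group_keys=False)[df.columns].apply(
    lambda g: g.nlargest(1, "value")
)
print(top.reset_index(drop=True))
```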
workbench/api/__init__.py CHANGED
@@ -5,6 +5,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
  - DataSource: Manages AWS Data Catalog and Athena
  - FeatureSet: Manages AWS Feature Store and Feature Groups
  - Model: Manages the training and deployment of AWS Model Groups and Packages
+ - MetaModel: A Model that aggregates predictions from multiple child endpoints
  - ModelType: Enum for the different model types supported by Workbench
  - Endpoint: Manages the deployment and invocations/inference on AWS Endpoints
  - Meta: Provides an API to retrieve AWS Metadata for the above classes
@@ -15,6 +16,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
  from .data_source import DataSource
  from .feature_set import FeatureSet
  from .model import Model, ModelType, ModelFramework
+ from .meta_model import MetaModel
  from .endpoint import Endpoint
  from .meta import Meta
  from .parameter_store import ParameterStore
@@ -24,6 +26,7 @@ __all__ = [
      "DataSource",
      "FeatureSet",
      "Model",
+     "MetaModel",
      "ModelType",
      "ModelFramework",
      "Endpoint",
workbench/api/endpoint.py CHANGED
@@ -44,16 +44,21 @@ class Endpoint(EndpointCore):
          """
          return super().inference(eval_df, capture_name, id_column, drop_error_rows)
 
-     def auto_inference(self, capture: bool = False) -> pd.DataFrame:
-         """Run inference on the Endpoint using the FeatureSet evaluation data
+     def auto_inference(self) -> pd.DataFrame:
+         """Run inference on the Endpoint using the test data from the model training view
 
-         Args:
-             capture (bool): Capture the inference results
+         Returns:
+             pd.DataFrame: The DataFrame with predictions
+         """
+         return super().auto_inference()
+
+     def full_inference(self) -> pd.DataFrame:
+         """Run inference on the Endpoint using the full data from the model training view
 
          Returns:
              pd.DataFrame: The DataFrame with predictions
          """
-         return super().auto_inference(capture)
+         return super().full_inference()
 
      def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
          """Run inference on the Endpoint using the provided DataFrame
workbench/api/feature_set.py CHANGED
@@ -154,23 +154,93 @@ class FeatureSet(FeatureSetCore):
          # Return the Model
          return Model(name)
 
-     def prox_model(self, target: str, features: list) -> "Proximity":  # noqa: F821
-         """Create a local Proximity Model for this Model
+     def prox_model(
+         self, target: str, features: list, include_all_columns: bool = False
+     ) -> "FeatureSpaceProximity":  # noqa: F821
+         """Create a local FeatureSpaceProximity Model for this FeatureSet
 
          Args:
              target (str): The target column name
              features (list): The list of feature column names
+             include_all_columns (bool): Include all DataFrame columns in results (default: False)
 
          Returns:
-             Proximity: A local Proximity Model
+             FeatureSpaceProximity: A local FeatureSpaceProximity Model
          """
-         from workbench.algorithms.dataframe.proximity import Proximity  # noqa: F401 (avoid circular import)
+         from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity  # noqa: F401
 
          # Create the Proximity Model from the full FeatureSet dataframe
          full_df = self.pull_dataframe()
 
-         # Create and return the Proximity Model
-         return Proximity(full_df, self.id_column, features, target, track_columns=features)
+         # Create and return the FeatureSpaceProximity Model
+         return FeatureSpaceProximity(
+             full_df, id_column=self.id_column, features=features, target=target, include_all_columns=include_all_columns
+         )
+
+     def fp_prox_model(
+         self,
+         target: str,
+         fingerprint_column: str = None,
+         include_all_columns: bool = False,
+         radius: int = 2,
+         n_bits: int = 1024,
+         counts: bool = False,
+     ) -> "FingerprintProximity":  # noqa: F821
+         """Create a local FingerprintProximity Model for this FeatureSet
+
+         Args:
+             target (str): The target column name
+             fingerprint_column (str): Column containing fingerprints. If None, uses existing 'fingerprint'
+                 column or computes from SMILES column.
+             include_all_columns (bool): Include all DataFrame columns in results (default: False)
+             radius (int): Radius for Morgan fingerprint computation (default: 2)
+             n_bits (int): Number of bits for fingerprint (default: 1024)
+             counts (bool): Whether to use count simulation (default: False)
+
+         Returns:
+             FingerprintProximity: A local FingerprintProximity Model
+         """
+         from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity  # noqa: F401
+
+         # Create the Proximity Model from the full FeatureSet dataframe
+         full_df = self.pull_dataframe()
+
+         # Create and return the FingerprintProximity Model
+         return FingerprintProximity(
+             full_df,
+             id_column=self.id_column,
+             fingerprint_column=fingerprint_column,
+             target=target,
+             include_all_columns=include_all_columns,
+             radius=radius,
+             n_bits=n_bits,
+             counts=counts,
+         )
+
+     def cleanlab_model(
+         self,
+         target: str,
+         features: list,
+         model_type: ModelType = ModelType.REGRESSOR,
+     ) -> "CleanLearning":  # noqa: F821
+         """Create a CleanLearning model for detecting label issues in this FeatureSet
+
+         Args:
+             target (str): The target column name
+             features (list): The list of feature column names
+             model_type (ModelType): The model type (REGRESSOR or CLASSIFIER). Defaults to REGRESSOR.
+
+         Returns:
+             CleanLearning: A fitted cleanlab model. Use get_label_issues() to get
+             a DataFrame with id_column, label_quality, predicted_label, given_label, is_label_issue.
+         """
+         from workbench.algorithms.models.cleanlab_model import create_cleanlab_model  # noqa: F401
+
+         # Get the full FeatureSet dataframe
+         full_df = self.pull_dataframe()
+
+         # Create and return the CleanLearning model
+         return create_cleanlab_model(full_df, self.id_column, features, target, model_type=model_type)
 
 
  if __name__ == "__main__":
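
Note: the three new FeatureSet helpers pair naturally; a hedged sketch using the AQSol demo artifacts that the cleanlab module's own `__main__` block exercises (names assume those artifacts exist). One wrinkle worth flagging: `cleanlab_model()` is annotated as returning `CleanLearning`, but per the diff it returns the `CleanlabModels` factory from `create_cleanlab_model`, so call `clean_learning()` on the result.

```python
from workbench.api import FeatureSet, Model, ModelType

fs = FeatureSet("aqsol_features")        # demo FeatureSet from the package's examples
model = Model("aqsol-regression")
features, target = model.features(), model.target()

prox = fs.prox_model(target, features)   # FeatureSpaceProximity over the numeric features
fp_prox = fs.fp_prox_model(target)       # FingerprintProximity (Morgan, radius=2, 1024 bits)
cleanlab = fs.cleanlab_model(target, features, model_type=ModelType.REGRESSOR)
issues = cleanlab.clean_learning().get_label_issues()  # factory, despite the docstring's return type
```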