workbench 0.8.213__py3-none-any.whl → 0.8.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/algorithms/sql/outliers.py +3 -3
  9. workbench/api/__init__.py +3 -0
  10. workbench/api/endpoint.py +10 -5
  11. workbench/api/feature_set.py +76 -6
  12. workbench/api/meta_model.py +289 -0
  13. workbench/api/model.py +43 -4
  14. workbench/core/artifacts/endpoint_core.py +65 -117
  15. workbench/core/artifacts/feature_set_core.py +3 -3
  16. workbench/core/artifacts/model_core.py +6 -4
  17. workbench/core/pipelines/pipeline_executor.py +1 -1
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  19. workbench/model_script_utils/model_script_utils.py +15 -11
  20. workbench/model_script_utils/pytorch_utils.py +11 -1
  21. workbench/model_scripts/chemprop/chemprop.template +147 -71
  22. workbench/model_scripts/chemprop/generated_model_script.py +151 -75
  23. workbench/model_scripts/chemprop/model_script_utils.py +15 -11
  24. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  25. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  27. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  28. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  29. workbench/model_scripts/meta_model/meta_model.template +209 -0
  30. workbench/model_scripts/pytorch_model/generated_model_script.py +45 -27
  31. workbench/model_scripts/pytorch_model/model_script_utils.py +15 -11
  32. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  33. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  34. workbench/model_scripts/script_generation.py +4 -0
  35. workbench/model_scripts/xgb_model/generated_model_script.py +167 -156
  36. workbench/model_scripts/xgb_model/model_script_utils.py +15 -11
  37. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  38. workbench/repl/workbench_shell.py +0 -5
  39. workbench/scripts/endpoint_test.py +2 -2
  40. workbench/scripts/meta_model_sim.py +35 -0
  41. workbench/utils/chem_utils/fingerprints.py +87 -46
  42. workbench/utils/chemprop_utils.py +23 -5
  43. workbench/utils/meta_model_simulator.py +499 -0
  44. workbench/utils/metrics_utils.py +94 -10
  45. workbench/utils/model_utils.py +91 -9
  46. workbench/utils/pytorch_utils.py +1 -1
  47. workbench/utils/shap_utils.py +1 -55
  48. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  49. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/METADATA +2 -1
  50. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/RECORD +54 -50
  51. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/entry_points.txt +1 -0
  52. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  53. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  54. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  55. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  56. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/WHEEL +0 -0
  57. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/licenses/LICENSE +0 -0
  58. {workbench-0.8.213.dist-info → workbench-0.8.219.dist-info}/top_level.txt +0 -0
@@ -18,10 +18,32 @@ from sklearn.metrics import (
18
18
  log = logging.getLogger("workbench")
19
19
 
20
20
 
21
def validate_proba_columns(predictions_df: pd.DataFrame, class_labels: List[str], guessing: bool = False) -> bool:
    """Validate that the ``*_proba`` columns of a predictions DataFrame match the class labels.

    The label for each probability column is the column name with its trailing
    ``_proba`` suffix removed. The set of those labels must equal the set of
    class labels (order is ignored).

    Args:
        predictions_df: DataFrame with prediction results
        class_labels: List of class labels
        guessing: Whether class labels were guessed from data (only changes the error text)

    Returns:
        True if validation passes

    Raises:
        ValueError: If probability columns don't match class labels
    """
    suffix = "_proba"

    # Strip ONLY the trailing suffix: str.replace() would also eat a "_proba" embedded
    # in the label itself (e.g. "x_proba_y_proba" must map to "x_proba_y", not "xy").
    # Non-string column names cannot be probability columns, so they are skipped.
    proba_columns = [
        col[: -len(suffix)] for col in predictions_df.columns if isinstance(col, str) and col.endswith(suffix)
    ]

    # A column name can only carry the label as text, so compare labels by their string
    # form. This keeps int/bool labels from being reported as a mismatch and avoids a
    # TypeError from sorting mixed-type labels.
    if sorted(str(label) for label in class_labels) != sorted(proba_columns):
        label_type = "GUESSED class_labels" if guessing else "class_labels"
        raise ValueError(f"_proba columns {proba_columns} != {label_type} {class_labels}!")
    return True
41
+
42
+
21
43
  def compute_classification_metrics(
22
44
  predictions_df: pd.DataFrame,
23
45
  target_col: str,
24
- class_labels: List[str],
46
+ class_labels: Optional[List[str]] = None,
25
47
  prediction_col: str = "prediction",
26
48
  ) -> pd.DataFrame:
27
49
  """Compute classification metrics from a predictions DataFrame.
@@ -29,26 +51,62 @@ def compute_classification_metrics(
29
51
  Args:
30
52
  predictions_df: DataFrame with target and prediction columns
31
53
  target_col: Name of the target column
32
- class_labels: List of class labels in order
54
+ class_labels: List of class labels in order (if None, inferred from target column)
33
55
  prediction_col: Name of the prediction column (default: "prediction")
34
56
 
35
57
  Returns:
36
58
  DataFrame with per-class metrics (precision, recall, f1, roc_auc, support)
37
- plus a weighted 'all' row
59
+ plus a weighted 'all' row. Returns empty DataFrame if validation fails.
38
60
  """
39
- y_true = predictions_df[target_col]
40
- y_pred = predictions_df[prediction_col]
61
+ # Validate inputs
62
+ if predictions_df.empty:
63
+ log.warning("Empty DataFrame provided. Returning empty metrics.")
64
+ return pd.DataFrame()
65
+
66
+ if prediction_col not in predictions_df.columns:
67
+ log.warning(f"Prediction column '{prediction_col}' not found in DataFrame. Returning empty metrics.")
68
+ return pd.DataFrame()
69
+
70
+ if target_col not in predictions_df.columns:
71
+ log.warning(f"Target column '{target_col}' not found in DataFrame. Returning empty metrics.")
72
+ return pd.DataFrame()
73
+
74
+ # Handle NaN predictions
75
+ df = predictions_df.copy()
76
+ nan_pred = df[prediction_col].isnull().sum()
77
+ if nan_pred > 0:
78
+ log.warning(f"Dropping {nan_pred} rows with NaN predictions.")
79
+ df = df[~df[prediction_col].isnull()]
80
+
81
+ if df.empty:
82
+ log.warning("No valid rows after dropping NaNs. Returning empty metrics.")
83
+ return pd.DataFrame()
84
+
85
+ # Handle class labels
86
+ guessing = False
87
+ if class_labels is None:
88
+ log.warning("Class labels not provided. Inferring from target column.")
89
+ class_labels = df[target_col].unique().tolist()
90
+ guessing = True
91
+
92
+ # Validate probability columns if present
93
+ proba_cols = [col for col in df.columns if col.endswith("_proba")]
94
+ if proba_cols:
95
+ validate_proba_columns(df, class_labels, guessing=guessing)
96
+
97
+ y_true = df[target_col]
98
+ y_pred = df[prediction_col]
41
99
 
42
100
  # Precision, recall, f1, support per class
43
101
  prec, rec, f1, support = precision_recall_fscore_support(y_true, y_pred, labels=class_labels, zero_division=0)
44
102
 
45
103
  # ROC AUC per class (requires probability columns and sorted labels)
46
- proba_cols = [f"{label}_proba" for label in class_labels]
47
- if all(col in predictions_df.columns for col in proba_cols):
104
+ proba_col_names = [f"{label}_proba" for label in class_labels]
105
+ if all(col in df.columns for col in proba_col_names):
48
106
  # roc_auc_score requires labels to be sorted, so we sort and reorder results back
49
107
  sorted_labels = sorted(class_labels)
50
108
  sorted_proba_cols = [f"{label}_proba" for label in sorted_labels]
51
- y_score_sorted = predictions_df[sorted_proba_cols].values
109
+ y_score_sorted = df[sorted_proba_cols].values
52
110
  roc_auc_sorted = roc_auc_score(y_true, y_score_sorted, labels=sorted_labels, multi_class="ovr", average=None)
53
111
  # Map back to original class_labels order
54
112
  label_to_auc = dict(zip(sorted_labels, roc_auc_sorted))
@@ -97,9 +155,35 @@ def compute_regression_metrics(
97
155
 
98
156
  Returns:
99
157
  DataFrame with regression metrics (rmse, mae, medae, r2, spearmanr, support)
158
+ Returns empty DataFrame if validation fails or no valid data.
100
159
  """
101
- y_true = predictions_df[target_col].values
102
- y_pred = predictions_df[prediction_col].values
160
+ # Validate inputs
161
+ if predictions_df.empty:
162
+ log.warning("Empty DataFrame provided. Returning empty metrics.")
163
+ return pd.DataFrame()
164
+
165
+ if prediction_col not in predictions_df.columns:
166
+ log.warning(f"Prediction column '{prediction_col}' not found in DataFrame. Returning empty metrics.")
167
+ return pd.DataFrame()
168
+
169
+ if target_col not in predictions_df.columns:
170
+ log.warning(f"Target column '{target_col}' not found in DataFrame. Returning empty metrics.")
171
+ return pd.DataFrame()
172
+
173
+ # Handle NaN values
174
+ df = predictions_df[[target_col, prediction_col]].copy()
175
+ nan_target = df[target_col].isnull().sum()
176
+ nan_pred = df[prediction_col].isnull().sum()
177
+ if nan_target > 0 or nan_pred > 0:
178
+ log.warning(f"NaNs found: {target_col}={nan_target}, {prediction_col}={nan_pred}. Dropping NaN rows.")
179
+ df = df.dropna()
180
+
181
+ if df.empty:
182
+ log.warning("No valid rows after dropping NaNs. Returning empty metrics.")
183
+ return pd.DataFrame()
184
+
185
+ y_true = df[target_col].values
186
+ y_pred = df[prediction_col].values
103
187
 
104
188
  return pd.DataFrame(
105
189
  [
@@ -93,16 +93,17 @@ def get_custom_script_path(package: str, script_name: str) -> Path:
93
93
  return script_path
94
94
 
95
95
 
96
- def proximity_model_local(model: "Model"):
97
- """Create a Proximity Model for this Model
96
+ def proximity_model_local(model: "Model", include_all_columns: bool = False):
97
+ """Create a FeatureSpaceProximity Model for this Model
98
98
 
99
99
  Args:
100
100
  model (Model): The Model/FeatureSet used to create the proximity model
101
+ include_all_columns (bool): Include all DataFrame columns in neighbor results (default: False)
101
102
 
102
103
  Returns:
103
- Proximity: The proximity model
104
+ FeatureSpaceProximity: The proximity model
104
105
  """
105
- from workbench.algorithms.dataframe.proximity import Proximity # noqa: F401 (avoid circular import)
106
+ from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity # noqa: F401
106
107
  from workbench.api import Model, FeatureSet # noqa: F401 (avoid circular import)
107
108
 
108
109
  # Get Feature and Target Columns from the existing given Model
@@ -121,8 +122,59 @@ def proximity_model_local(model: "Model"):
121
122
  model_ids = set(model_df[id_column])
122
123
  full_df["in_model"] = full_df[id_column].isin(model_ids)
123
124
 
124
- # Create and return the Proximity Model
125
- return Proximity(full_df, id_column, features, target, track_columns=features)
125
+ # Create and return the FeatureSpaceProximity Model
126
+ return FeatureSpaceProximity(
127
+ full_df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
128
+ )
129
+
130
+
131
def fingerprint_prox_model_local(
    model: "Model",
    include_all_columns: bool = False,
    radius: int = 2,
    n_bits: int = 1024,
    counts: bool = False,
):
    """Build a local FingerprintProximity model from a Model and its input FeatureSet.

    Args:
        model (Model): The Model used to create the fingerprint proximity model
        include_all_columns (bool): Include all DataFrame columns in neighbor results (default: False)
        radius (int): Morgan fingerprint radius (default: 2)
        n_bits (int): Number of bits for the fingerprint (default: 1024)
        counts (bool): Use count fingerprints instead of binary (default: False)

    Returns:
        FingerprintProximity: The fingerprint proximity model
    """
    from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity  # noqa: F401
    from workbench.api import Model, FeatureSet  # noqa: F401 (avoid circular import)

    # The target column comes straight from the given Model
    target_column = model.target()

    # The Model's input FeatureSet supplies the ID column
    feature_set = FeatureSet(model.get_input())
    id_col = feature_set.id_column

    # Two pulls: every FeatureSet row, and the rows in the Model's training view
    all_rows = feature_set.pull_dataframe()
    training_rows = model.training_view().pull_dataframe()

    # Flag each FeatureSet row whose ID also appears in the training view
    training_ids = set(training_rows[id_col])
    all_rows["in_model"] = all_rows[id_col].isin(training_ids)

    # Hand everything to FingerprintProximity
    proximity_kwargs = {
        "id_column": id_col,
        "target": target_column,
        "include_all_columns": include_all_columns,
        "radius": radius,
        "n_bits": n_bits,
        "counts": counts,
    }
    return FingerprintProximity(all_rows, **proximity_kwargs)
126
178
 
127
179
 
128
180
  def noise_model_local(model: "Model"):
@@ -157,13 +209,43 @@ def noise_model_local(model: "Model"):
157
209
  return NoiseModel(full_df, id_column, features, target)
158
210
 
159
211
 
160
- def published_proximity_model(model: "Model", prox_model_name: str, track_columns: list = None) -> "Model":
212
def cleanlab_model_local(model: "Model"):
    """Build cleanlab models for spotting data quality issues in the data behind a Model.

    Args:
        model (Model): The Model used to create the cleanlab models

    Returns:
        CleanlabModels: Factory providing access to CleanLearning and Datalab models.
            - clean_learning(): CleanLearning model with enhanced get_label_issues()
            - datalab(): Datalab instance with report(), get_issues()
    """
    from workbench.algorithms.models.cleanlab_model import create_cleanlab_model  # noqa: F401 (avoid circular import)
    from workbench.api import Model, FeatureSet  # noqa: F401 (avoid circular import)

    # Everything column-related is read off the given Model
    feature_columns = model.features()
    target_column = model.target()
    kind_of_model = model.model_type

    # The Model's input FeatureSet supplies the ID column and the full data
    feature_set = FeatureSet(model.get_input())
    id_col = feature_set.id_column
    all_rows = feature_set.pull_dataframe()

    # Delegate construction to the cleanlab factory function
    return create_cleanlab_model(all_rows, id_col, feature_columns, target_column, model_type=kind_of_model)
240
+
241
+
242
+ def published_proximity_model(model: "Model", prox_model_name: str, include_all_columns: bool = False) -> "Model":
161
243
  """Create a published proximity model based on the given model
162
244
 
163
245
  Args:
164
246
  model (Model): The model to create the proximity model from
165
247
  prox_model_name (str): The name of the proximity model to create
166
- track_columns (list, optional): List of columns to track in the proximity model
248
+ include_all_columns (bool): Include all DataFrame columns in results (default: False)
167
249
  Returns:
168
250
  Model: The proximity model
169
251
  """
@@ -186,7 +268,7 @@ def published_proximity_model(model: "Model", prox_model_name: str, track_column
186
268
  description=f"Proximity Model for {model.name}",
187
269
  tags=["proximity", model.name],
188
270
  custom_script=script_path,
189
- custom_args={"track_columns": track_columns},
271
+ custom_args={"include_all_columns": include_all_columns},
190
272
  )
191
273
  return prox_model
192
274
 
@@ -75,7 +75,7 @@ if __name__ == "__main__":
75
75
  from workbench.api import Model
76
76
 
77
77
  # Test pulling CV results
78
- model_name = "aqsol-pytorch-reg"
78
+ model_name = "aqsol-reg-pytorch"
79
79
  print(f"Loading Workbench model: {model_name}")
80
80
  model = Model(model_name)
81
81
  print(f"Model Framework: {model.model_framework}")
@@ -9,6 +9,7 @@ from typing import Optional, List, Tuple, Dict, Union
9
9
  from workbench.utils.xgboost_model_utils import xgboost_model_from_s3
10
10
  from workbench.utils.model_utils import load_category_mappings_from_s3
11
11
  from workbench.utils.pandas_utils import convert_categorical_types
12
+ from workbench.model_script_utils.model_script_utils import decompress_features
12
13
 
13
14
  # Set up the log
14
15
  log = logging.getLogger("workbench")
@@ -111,61 +112,6 @@ def shap_values_data(
111
112
  return result_df, feature_df
112
113
 
113
114
 
114
def decompress_features(
    df: pd.DataFrame, features: List[str], compressed_features: List[str]
) -> Tuple[pd.DataFrame, List[str]]:
    """Expand bitstring-encoded feature columns into one uint8 column per bit.

    Each compressed column is dropped and replaced by columns named
    ``<first 3 chars of the feature name>_<bit index>``.

    Args:
        df (pd.DataFrame): The features DataFrame
        features (List[str]): Full list of feature names (not modified by this call)
        compressed_features (List[str]): List of feature names to decompress (bitstrings)

    Returns:
        pd.DataFrame: DataFrame with the decompressed features
        List[str]: Updated list of feature names after decompression

    Note:
        Missing values in ``features`` only produce a printed warning; no exception
        is raised for them here.
    """

    # Check for any missing values in the required features (warn only)
    missing_counts = df[features].isna().sum()
    if missing_counts.any():
        missing_features = missing_counts[missing_counts > 0]
        print(
            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
            "WARNING: You might want to remove/replace all NaN values before processing."
        )

    # Work on a copy: the previous `decompressed_features = features` aliased the caller's
    # list, so remove()/extend() below silently rewrote the argument that was passed in.
    decompressed_features = list(features)
    for feature in compressed_features:
        if (feature not in df.columns) or (feature not in decompressed_features):
            print(f"Feature '{feature}' not in the features list, skipping decompression.")
            continue

        # Remove the feature from the list of features to avoid duplication
        decompressed_features.remove(feature)

        # Handle all compressed features as bitstrings
        # NOTE(review): assumes every bitstring in the column has the same length - confirm
        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
        prefix = feature[:3]

        # Create all new columns at once - avoids fragmentation
        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)

        # Add to features list
        decompressed_features.extend(new_col_names)

        # Drop original column and concatenate new ones
        df = df.drop(columns=[feature])
        df = pd.concat([df, new_df], axis=1)

    return df, decompressed_features
167
-
168
-
169
115
  def _calculate_shap_values(workbench_model, sample_df: pd.DataFrame = None):
170
116
  """
171
117
  Internal function to calculate SHAP values for Workbench Models.
@@ -420,21 +420,17 @@ if __name__ == "__main__":
420
420
  df = pd.DataFrame(data)
421
421
 
422
422
  # Get a UQ regressor model
423
- # from workbench.api import Endpoint, DFStore
424
- # end = Endpoint("aqsol-uq")
425
- # df = end.auto_inference()
426
- # DFStore().upsert("/workbench/models/aqsol-uq/auto_inference", df)
423
+ from workbench.api import Model
427
424
 
428
- from workbench.api import DFStore
429
-
430
- df = DFStore().get("/workbench/models/aqsol-uq-100/full_cross_fold_inference")
425
+ model = Model("logd-reg-xgb")
426
+ df = model.get_inference_predictions("full_cross_fold")
431
427
 
432
428
  # Run the Unit Test on the Plugin
433
429
  PluginUnitTest(
434
430
  ScatterPlot,
435
431
  input_data=df,
436
432
  theme="midnight_blue",
437
- x="solubility",
433
+ x="logd",
438
434
  y="prediction",
439
435
  color="prediction_std",
440
436
  suppress_hover_display=True,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: workbench
3
- Version: 0.8.213
3
+ Version: 0.8.219
4
4
  Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
5
5
  Author-email: SuperCowPowers LLC <support@supercowpowers.com>
6
6
  License: MIT License
@@ -47,6 +47,7 @@ Requires-Dist: cryptography>=44.0.2
47
47
  Requires-Dist: ipython>=8.37.0
48
48
  Requires-Dist: pyreadline3; sys_platform == "win32"
49
49
  Requires-Dist: scikit-learn>=1.5.2
50
+ Requires-Dist: umap-learn>=0.5.8
50
51
  Requires-Dist: xgboost>=3.0.3
51
52
  Requires-Dist: joblib>=1.3.2
52
53
  Requires-Dist: requests>=2.26.0