workbench-0.8.202-py3-none-any.whl → workbench-0.8.220-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (84)
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  3. workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
  4. workbench/algorithms/dataframe/projection_2d.py +44 -21
  5. workbench/algorithms/dataframe/proximity.py +78 -150
  6. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  7. workbench/algorithms/models/cleanlab_model.py +382 -0
  8. workbench/algorithms/models/noise_model.py +388 -0
  9. workbench/algorithms/sql/outliers.py +3 -3
  10. workbench/api/__init__.py +3 -0
  11. workbench/api/df_store.py +17 -108
  12. workbench/api/endpoint.py +13 -11
  13. workbench/api/feature_set.py +111 -8
  14. workbench/api/meta_model.py +289 -0
  15. workbench/api/model.py +45 -12
  16. workbench/api/parameter_store.py +3 -52
  17. workbench/cached/cached_model.py +4 -4
  18. workbench/core/artifacts/artifact.py +5 -5
  19. workbench/core/artifacts/df_store_core.py +114 -0
  20. workbench/core/artifacts/endpoint_core.py +228 -237
  21. workbench/core/artifacts/feature_set_core.py +185 -230
  22. workbench/core/artifacts/model_core.py +34 -26
  23. workbench/core/artifacts/parameter_store_core.py +98 -0
  24. workbench/core/pipelines/pipeline_executor.py +1 -1
  25. workbench/core/transforms/features_to_model/features_to_model.py +22 -10
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  28. workbench/model_script_utils/model_script_utils.py +339 -0
  29. workbench/model_script_utils/pytorch_utils.py +405 -0
  30. workbench/model_script_utils/uq_harness.py +278 -0
  31. workbench/model_scripts/chemprop/chemprop.template +428 -631
  32. workbench/model_scripts/chemprop/generated_model_script.py +432 -635
  33. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  34. workbench/model_scripts/chemprop/requirements.txt +2 -10
  35. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  36. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  37. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  38. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  39. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  40. workbench/model_scripts/meta_model/meta_model.template +209 -0
  41. workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
  42. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  43. workbench/model_scripts/pytorch_model/pytorch.template +370 -609
  44. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  45. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  46. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  47. workbench/model_scripts/script_generation.py +6 -5
  48. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  49. workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
  50. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  51. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  52. workbench/model_scripts/xgb_model/xgb_model.template +366 -396
  53. workbench/repl/workbench_shell.py +0 -5
  54. workbench/resources/open_source_api.key +1 -1
  55. workbench/scripts/endpoint_test.py +2 -2
  56. workbench/scripts/meta_model_sim.py +35 -0
  57. workbench/scripts/training_test.py +85 -0
  58. workbench/utils/chem_utils/fingerprints.py +87 -46
  59. workbench/utils/chem_utils/projections.py +16 -6
  60. workbench/utils/chemprop_utils.py +36 -655
  61. workbench/utils/meta_model_simulator.py +499 -0
  62. workbench/utils/metrics_utils.py +256 -0
  63. workbench/utils/model_utils.py +192 -54
  64. workbench/utils/pytorch_utils.py +33 -472
  65. workbench/utils/shap_utils.py +1 -55
  66. workbench/utils/xgboost_local_crossfold.py +267 -0
  67. workbench/utils/xgboost_model_utils.py +49 -356
  68. workbench/web_interface/components/model_plot.py +7 -1
  69. workbench/web_interface/components/plugins/model_details.py +30 -68
  70. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  71. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
  72. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
  73. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
  74. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  75. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  76. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  77. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  78. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  79. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  80. workbench/model_scripts/uq_models/mapie.template +0 -605
  81. workbench/model_scripts/uq_models/requirements.txt +0 -1
  82. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  83. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
  84. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -1,410 +0,0 @@
- import pandas as pd
- import numpy as np
- from sklearn.preprocessing import StandardScaler
- from sklearn.neighbors import NearestNeighbors
- from typing import List, Dict, Optional, Union
- import logging
-
- # Set up logging
- log = logging.getLogger("workbench")
-
-
- class Proximity:
-     def __init__(
-         self,
-         df: pd.DataFrame,
-         id_column: str,
-         features: List[str],
-         target: Optional[str] = None,
-         track_columns: Optional[List[str]] = None,
-     ):
-         """
-         Initialize the Proximity class.
-
-         Args:
-             df: DataFrame containing data for neighbor computations.
-             id_column: Name of the column used as the identifier.
-             features: List of feature column names to be used for neighbor computations.
-             target: Name of the target column. Defaults to None.
-             track_columns: Additional columns to track in results. Defaults to None.
-         """
-         self.id_column = id_column
-         self.target = target
-         self.track_columns = track_columns or []
-
-         # Filter out non-numeric features
-         self.features = self._validate_features(df, features)
-
-         # Drop NaN rows and set up DataFrame
-         self.df = df.dropna(subset=self.features).copy()
-
-         # Compute target range if target is provided
-         self.target_range = None
-         if self.target and self.target in self.df.columns:
-             self.target_range = self.df[self.target].max() - self.df[self.target].min()
-
-         # Build the proximity model
-         self._build_model()
-
-         # Precompute landscape metrics
-         self._precompute_metrics()
-
-     def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
-         """
-         Find isolated data points based on distance to nearest neighbor.
-
-         Args:
-             top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)
-
-         Returns:
-             DataFrame of observations above the percentile threshold, sorted by distance (descending)
-         """
-         percentile = 100 - top_percent
-         threshold = np.percentile(self.df["nn_distance"], percentile)
-         isolated = self.df[self.df["nn_distance"] >= threshold].copy()
-         return isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
-
-     def target_gradients(
-         self,
-         top_percent: float = 1.0,
-         min_delta: Optional[float] = None,
-         k_neighbors: int = 4,
-         only_coincident: bool = False,
-     ) -> pd.DataFrame:
-         """
-         Find compounds with steep target gradients (data quality issues and activity cliffs).
-
-         Uses a two-phase approach:
-         1. Quick filter using nearest neighbor gradient
-         2. Verify using k-neighbor median to handle cases where the nearest neighbor is the outlier
-
-         Args:
-             top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
-             min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
-             k_neighbors: Number of neighbors to use for median calculation (default: 4)
-             only_coincident: If True, only consider compounds that are coincident (default: False)
-
-         Returns:
-             DataFrame of compounds with steepest gradients, sorted by gradient (descending)
-         """
-         if self.target is None:
-             raise ValueError("Target column must be specified")
-
-         epsilon = 1e-5
-
-         # Phase 1: Quick filter using precomputed nearest neighbor
-         candidates = self.df.copy()
-         candidates["gradient"] = candidates["nn_target_diff"] / (candidates["nn_distance"] + epsilon)
-
-         # Apply min_delta
-         if min_delta is None:
-             min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
-         candidates = candidates[candidates["nn_target_diff"] >= min_delta]
-
-         # Filter based on mode
-         if only_coincident:
-             # Only keep coincident points (nn_distance ~= 0)
-             candidates = candidates[candidates["nn_distance"] < epsilon].copy()
-         else:
-             # Get top X% by initial gradient
-             percentile = 100 - top_percent
-             threshold = np.percentile(candidates["gradient"], percentile)
-             candidates = candidates[candidates["gradient"] >= threshold].copy()
-
-         # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
-         results = []
-         for _, row in candidates.iterrows():
-             cmpd_id = row[self.id_column]
-             cmpd_target = row[self.target]
-
-             # Get k nearest neighbors (excluding self)
-             nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
-
-             # Calculate median target of k neighbors, excluding the nearest neighbor (index 0)
-             neighbor_median = nbrs.iloc[1:k_neighbors][self.target].median()
-             median_diff = abs(cmpd_target - neighbor_median)
-
-             # Only keep if compound differs from neighborhood median
-             # This filters out cases where the nearest neighbor is the outlier
-             if median_diff >= min_delta:
-                 results.append(
-                     {
-                         self.id_column: cmpd_id,
-                         self.target: cmpd_target,
-                         "nn_target": row["nn_target"],
-                         "nn_target_diff": row["nn_target_diff"],
-                         "nn_distance": row["nn_distance"],
-                         "gradient": row["gradient"],  # Keep Phase 1 gradient
-                         "neighbor_median": neighbor_median,
-                         "neighbor_median_diff": median_diff,
-                     }
-                 )
-
-         # Handle empty results
-         if not results:
-             return pd.DataFrame(
-                 columns=[
-                     self.id_column,
-                     self.target,
-                     "neighbor_median",
-                     "neighbor_median_diff",
-                     "nn_distance",
-                     "gradient",
-                 ]
-             )
-
-         results_df = pd.DataFrame(results)
-         results_df = results_df.sort_values("gradient", ascending=False).reset_index(drop=True)
-         return results_df
-
-     def neighbors(
-         self,
-         id_or_ids: Union[str, int, List[Union[str, int]]],
-         n_neighbors: Optional[int] = 5,
-         radius: Optional[float] = None,
-         include_self: bool = True,
-     ) -> pd.DataFrame:
-         """
-         Return neighbors for ID(s) from the existing dataset.
-
-         Args:
-             id_or_ids: Single ID or list of IDs to look up
-             n_neighbors: Number of neighbors to return (default: 5, ignored if radius is set)
-             radius: If provided, find all neighbors within this radius
-             include_self: Whether to include self in results (default: True)
-
-         Returns:
-             DataFrame containing neighbors and distances
-         """
-         # Normalize to list
-         ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
-
-         # Validate IDs exist
-         missing_ids = set(ids) - set(self.df[self.id_column])
-         if missing_ids:
-             raise ValueError(f"IDs not found in dataset: {missing_ids}")
-
-         # Filter to requested IDs and preserve order
-         query_df = self.df[self.df[self.id_column].isin(ids)]
-         query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
-
-         # Transform query features
-         X_query = self.scaler.transform(query_df[self.features])
-
-         # Get neighbors
-         if radius is not None:
-             distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
-         else:
-             distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)
-
-         # Build results
-         results = []
-         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-             query_id = query_df.iloc[i][self.id_column]
-
-             for neighbor_idx, dist in zip(nbrs, dists):
-                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
-
-                 # Skip self if requested
-                 if not include_self and neighbor_id == query_id:
-                     continue
-
-                 results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
-
-         df_results = pd.DataFrame(results)
-         df_results["is_self"] = df_results["neighbor_id"] == df_results[self.id_column]
-         df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
-         return df_results.drop("is_self", axis=1).reset_index(drop=True)
-
-     def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
-         """Remove non-numeric features and log warnings."""
-         non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
-         if non_numeric:
-             log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
-         return [f for f in features if f not in non_numeric]
-
-     def _build_model(self) -> None:
-         """Standardize features and fit Nearest Neighbors model."""
-         self.scaler = StandardScaler()
-         X = self.scaler.fit_transform(self.df[self.features])
-         self.nn = NearestNeighbors().fit(X)
-
-     def _precompute_metrics(self, n_neighbors: int = 10) -> None:
-         """
-         Precompute landscape metrics for all compounds.
-
-         Adds columns to self.df:
-         - nn_distance: Distance to nearest neighbor
-         - nn_id: ID of nearest neighbor
-
-         If target is specified, also adds:
-         - nn_target: Target value of nearest neighbor
-         - nn_target_diff: Absolute difference from nearest neighbor target
-         """
-         log.info("Precomputing proximity metrics...")
-
-         # Make sure n_neighbors isn't greater than dataset size
-         n_neighbors = min(n_neighbors, len(self.df) - 1)
-
-         # Get nearest neighbors for all points (including self)
-         X = self.scaler.transform(self.df[self.features])
-         distances, indices = self.nn.kneighbors(X, n_neighbors=2)  # Just need nearest neighbor
-
-         # Extract nearest neighbor (index 1, since index 0 is self)
-         self.df["nn_distance"] = distances[:, 1]
-         self.df["nn_id"] = self.df.iloc[indices[:, 1]][self.id_column].values
-
-         # If target exists, compute target-based metrics
-         if self.target and self.target in self.df.columns:
-             # Get target values for nearest neighbor
-             nn_target_values = self.df.iloc[indices[:, 1]][self.target].values
-             self.df["nn_target"] = nn_target_values
-             self.df["nn_target_diff"] = np.abs(self.df[self.target].values - nn_target_values)
-
-             # Precompute target range for min_delta default
-             self.target_range = self.df[self.target].max() - self.df[self.target].min()
-
-         log.info("Proximity metrics precomputed successfully")
-
-     def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
-         """
-         Build a result dictionary for a single neighbor.
-
-         Args:
-             query_id: ID of the query point
-             neighbor_idx: Index of the neighbor in the original DataFrame
-             distance: Distance between query and neighbor
-
-         Returns:
-             Dictionary containing neighbor information
-         """
-         neighbor_row = self.df.iloc[neighbor_idx]
-         neighbor_id = neighbor_row[self.id_column]
-
-         # Start with basic info
-         result = {
-             self.id_column: query_id,
-             "neighbor_id": neighbor_id,
-             "distance": 0.0 if distance < 1e-5 else distance,
-         }
-
-         # Add target if present
-         if self.target and self.target in self.df.columns:
-             result[self.target] = neighbor_row[self.target]
-
-         # Add tracked columns
-         for col in self.track_columns:
-             if col in self.df.columns:
-                 result[col] = neighbor_row[col]
-
-         # Add prediction/probability columns if they exist
-         for col in self.df.columns:
-             if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
-                 result[col] = neighbor_row[col]
-
-         return result
-
-
- # Testing the Proximity class
- if __name__ == "__main__":
-
-     pd.set_option("display.max_columns", None)
-     pd.set_option("display.width", 1000)
-
-     # Create a sample DataFrame
-     data = {
-         "ID": [1, 2, 3, 4, 5],
-         "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
-         "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
-         "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
-     }
-     df = pd.DataFrame(data)
-
-     # Test the Proximity class
-     features = ["Feature1", "Feature2", "Feature3"]
-     prox = Proximity(df, id_column="ID", features=features)
-     print(prox.neighbors(1, n_neighbors=2))
-
-     # Test the neighbors method with radius
-     print(prox.neighbors(1, radius=2.0))
-
-     # Test with Features list
-     prox = Proximity(df, id_column="ID", features=["Feature1"])
-     print(prox.neighbors(1))
-
-     # Create a sample DataFrame
-     data = {
-         "foo_id": ["a", "b", "c", "d", "e"],  # Testing string IDs
-         "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
-         "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
-         "target": [1, 0, 1, 0, 5],
-     }
-     df = pd.DataFrame(data)
-
-     # Test with String Ids
-     prox = Proximity(
-         df,
-         id_column="foo_id",
-         features=["Feature1", "Feature2"],
-         target="target",
-         track_columns=["Feature1", "Feature2"],
-     )
-     print(prox.neighbors(["a", "b"]))
-
-     # Test duplicate IDs
-     data = {
-         "foo_id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
-         "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
-         "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
-         "target": [1, 0, 1, 0, 5],
-     }
-     df = pd.DataFrame(data)
-     prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
-     print(df.equals(prox.df))
-
-     # Test with a categorical feature
-     from workbench.api import FeatureSet, Model
-
-     fs = FeatureSet("aqsol_features")
-     model = Model("aqsol-regression")
-     features = model.features()
-     df = fs.pull_dataframe()
-     prox = Proximity(
-         df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
-     )
-     print(prox.neighbors(df[fs.id_column].tolist()[:3]))
-
-     print("\n" + "=" * 80)
-     print("Testing isolated...")
-     print("=" * 80)
-
-     # Test isolated data in the top 1%
-     isolated_1pct = prox.isolated(top_percent=1.0)
-     print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
-     print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
-
-     # Test isolated data in the top 5%
-     isolated_5pct = prox.isolated(top_percent=5.0)
-     print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
-     print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
-
-     print("\n" + "=" * 80)
-     print("Testing target_gradients...")
-     print("=" * 80)
-
-     # Test with different parameters
-     gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
-     print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
-     print(
-         gradients_1pct[
-             [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "nn_distance", "gradient"]
-         ].head(10)
-     )
-
-     gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
-     print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
-     print(
-         gradients_5pct[
-             [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "nn_distance", "gradient"]
-         ].head(10)
-     )
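
For orientation: the removed Proximity class above is self-contained (pandas, NumPy, scikit-learn only), and its __main__ block doubles as usage documentation. Below is a minimal sketch condensed from that block; the toy DataFrame and column names are illustrative, and it assumes the class definition has been saved to a local module, since 0.8.220 no longer ships this file.

import numpy as np
import pandas as pd

# Toy data (illustrative only; stands in for a real FeatureSet pull)
rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "id": range(100),
        "feat_1": rng.normal(size=100),
        "feat_2": rng.normal(size=100),
        "activity": rng.normal(size=100),
    }
)

prox = Proximity(df, id_column="id", features=["feat_1", "feat_2"], target="activity")

# k-nearest neighbors for one ID (self is included by default)
print(prox.neighbors(0, n_neighbors=3))

# Top 5% most isolated rows by nearest-neighbor distance
print(prox.isolated(top_percent=5.0))

# Steepest target gradients: Phase 1 scores nn_target_diff / (nn_distance + 1e-5),
# Phase 2 re-checks each candidate against its k-neighbor median
print(prox.target_gradients(top_percent=5.0, min_delta=0.1))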