workbench 0.8.212__py3-none-any.whl → 0.8.217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
- workbench/algorithms/dataframe/projection_2d.py +38 -21
- workbench/algorithms/dataframe/proximity.py +75 -150
- workbench/algorithms/graph/light/proximity_graph.py +5 -5
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +2 -2
- workbench/api/__init__.py +3 -0
- workbench/api/endpoint.py +10 -5
- workbench/api/feature_set.py +76 -6
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +43 -4
- workbench/core/artifacts/endpoint_core.py +75 -129
- workbench/core/artifacts/feature_set_core.py +1 -1
- workbench/core/artifacts/model_core.py +6 -4
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
- workbench/model_script_utils/pytorch_utils.py +11 -1
- workbench/model_scripts/chemprop/chemprop.template +145 -69
- workbench/model_scripts/chemprop/generated_model_script.py +147 -71
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
- workbench/model_scripts/pytorch_model/pytorch.template +42 -24
- workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
- workbench/model_scripts/script_generation.py +4 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
- workbench/model_scripts/xgb_model/xgb_model.template +163 -152
- workbench/repl/workbench_shell.py +0 -5
- workbench/scripts/endpoint_test.py +2 -2
- workbench/utils/chem_utils/fingerprints.py +7 -3
- workbench/utils/chemprop_utils.py +23 -5
- workbench/utils/meta_model_simulator.py +471 -0
- workbench/utils/metrics_utils.py +94 -10
- workbench/utils/model_utils.py +91 -9
- workbench/utils/pytorch_utils.py +1 -1
- workbench/web_interface/components/plugins/scatter_plot.py +4 -8
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.212.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import numpy as np
|
|
3
|
-
from
|
|
4
|
-
from sklearn.neighbors import NearestNeighbors
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
5
4
|
from typing import List, Dict, Optional, Union
|
|
6
5
|
import logging
|
|
7
6
|
|
|
@@ -9,14 +8,16 @@ import logging
|
|
|
9
8
|
log = logging.getLogger("workbench")
|
|
10
9
|
|
|
11
10
|
|
|
12
|
-
class Proximity:
|
|
11
|
+
class Proximity(ABC):
|
|
12
|
+
"""Abstract base class for proximity/neighbor computations."""
|
|
13
|
+
|
|
13
14
|
def __init__(
|
|
14
15
|
self,
|
|
15
16
|
df: pd.DataFrame,
|
|
16
17
|
id_column: str,
|
|
17
18
|
features: List[str],
|
|
18
19
|
target: Optional[str] = None,
|
|
19
|
-
|
|
20
|
+
include_all_columns: bool = False,
|
|
20
21
|
):
|
|
21
22
|
"""
|
|
22
23
|
Initialize the Proximity class.
|
|
@@ -26,29 +27,61 @@ class Proximity:
|
|
|
26
27
|
id_column: Name of the column used as the identifier.
|
|
27
28
|
features: List of feature column names to be used for neighbor computations.
|
|
28
29
|
target: Name of the target column. Defaults to None.
|
|
29
|
-
|
|
30
|
+
include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
|
|
30
31
|
"""
|
|
31
32
|
self.id_column = id_column
|
|
33
|
+
self.features = features
|
|
32
34
|
self.target = target
|
|
33
|
-
self.
|
|
35
|
+
self.include_all_columns = include_all_columns
|
|
34
36
|
|
|
35
|
-
#
|
|
36
|
-
self.
|
|
37
|
+
# Store the DataFrame (subclasses may filter/modify in _prepare_data)
|
|
38
|
+
self.df = df.copy()
|
|
37
39
|
|
|
38
|
-
#
|
|
39
|
-
self.
|
|
40
|
+
# Prepare data (subclasses can override)
|
|
41
|
+
self._prepare_data()
|
|
40
42
|
|
|
41
43
|
# Compute target range if target is provided
|
|
42
44
|
self.target_range = None
|
|
43
45
|
if self.target and self.target in self.df.columns:
|
|
44
46
|
self.target_range = self.df[self.target].max() - self.df[self.target].min()
|
|
45
47
|
|
|
46
|
-
# Build the proximity model
|
|
48
|
+
# Build the proximity model (subclass-specific)
|
|
47
49
|
self._build_model()
|
|
48
50
|
|
|
49
51
|
# Precompute landscape metrics
|
|
50
52
|
self._precompute_metrics()
|
|
51
53
|
|
|
54
|
+
# Define core columns for output (subclasses can override)
|
|
55
|
+
self._set_core_columns()
|
|
56
|
+
|
|
57
|
+
# Project the data to 2D (subclass-specific)
|
|
58
|
+
self._project_2d()
|
|
59
|
+
|
|
60
|
+
def _prepare_data(self) -> None:
|
|
61
|
+
"""Prepare the data before building the model. Subclasses can override."""
|
|
62
|
+
pass
|
|
63
|
+
|
|
64
|
+
def _set_core_columns(self) -> None:
|
|
65
|
+
"""Set the core columns for output. Subclasses can override."""
|
|
66
|
+
self.core_columns = [self.id_column, "nn_distance", "nn_id"]
|
|
67
|
+
if self.target:
|
|
68
|
+
self.core_columns.extend([self.target, "nn_target", "nn_target_diff"])
|
|
69
|
+
|
|
70
|
+
@abstractmethod
|
|
71
|
+
def _build_model(self) -> None:
|
|
72
|
+
"""Build the proximity model. Must set self.nn (NearestNeighbors instance)."""
|
|
73
|
+
pass
|
|
74
|
+
|
|
75
|
+
@abstractmethod
|
|
76
|
+
def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
|
|
77
|
+
"""Transform features for querying. Returns feature matrix for nearest neighbor lookup."""
|
|
78
|
+
pass
|
|
79
|
+
|
|
80
|
+
@abstractmethod
|
|
81
|
+
def _project_2d(self) -> None:
|
|
82
|
+
"""Project the data to 2D for visualization. Updates self.df with 'x' and 'y' columns."""
|
|
83
|
+
pass
|
|
84
|
+
|
|
52
85
|
def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
|
|
53
86
|
"""
|
|
54
87
|
Find isolated data points based on distance to nearest neighbor.
|
|
@@ -62,7 +95,19 @@ class Proximity:
|
|
|
62
95
|
percentile = 100 - top_percent
|
|
63
96
|
threshold = np.percentile(self.df["nn_distance"], percentile)
|
|
64
97
|
isolated = self.df[self.df["nn_distance"] >= threshold].copy()
|
|
65
|
-
|
|
98
|
+
isolated = isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
|
|
99
|
+
return isolated if self.include_all_columns else isolated[self.core_columns]
|
|
100
|
+
|
|
101
|
+
def proximity_stats(self) -> pd.DataFrame:
|
|
102
|
+
"""
|
|
103
|
+
Return distribution statistics for nearest neighbor distances.
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
DataFrame with proximity distribution statistics (count, mean, std, percentiles)
|
|
107
|
+
"""
|
|
108
|
+
return (
|
|
109
|
+
self.df["nn_distance"].describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_frame()
|
|
110
|
+
)
|
|
66
111
|
|
|
67
112
|
def target_gradients(
|
|
68
113
|
self,
|
|
@@ -90,7 +135,7 @@ class Proximity:
|
|
|
90
135
|
if self.target is None:
|
|
91
136
|
raise ValueError("Target column must be specified")
|
|
92
137
|
|
|
93
|
-
epsilon = 1e-
|
|
138
|
+
epsilon = 1e-6
|
|
94
139
|
|
|
95
140
|
# Phase 1: Quick filter using precomputed nearest neighbor
|
|
96
141
|
candidates = self.df.copy()
|
|
@@ -111,13 +156,13 @@ class Proximity:
|
|
|
111
156
|
threshold = np.percentile(candidates["gradient"], percentile)
|
|
112
157
|
candidates = candidates[candidates["gradient"] >= threshold].copy()
|
|
113
158
|
|
|
114
|
-
# Phase 2: Verify with
|
|
159
|
+
# Phase 2: Verify with K-neighbor median to filter out cases where nearest neighbor is the outlier
|
|
115
160
|
results = []
|
|
116
161
|
for _, row in candidates.iterrows():
|
|
117
162
|
cmpd_id = row[self.id_column]
|
|
118
163
|
cmpd_target = row[self.target]
|
|
119
164
|
|
|
120
|
-
# Get
|
|
165
|
+
# Get K nearest neighbors (excluding self)
|
|
121
166
|
nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
|
|
122
167
|
|
|
123
168
|
# Calculate median target of k neighbors, excluding the nearest neighbor (index 0)
|
|
@@ -146,10 +191,12 @@ class Proximity:
|
|
|
146
191
|
columns=[
|
|
147
192
|
self.id_column,
|
|
148
193
|
self.target,
|
|
194
|
+
"nn_target",
|
|
195
|
+
"nn_target_diff",
|
|
196
|
+
"nn_distance",
|
|
197
|
+
"gradient",
|
|
149
198
|
"neighbor_median",
|
|
150
199
|
"neighbor_median_diff",
|
|
151
|
-
"mean_distance",
|
|
152
|
-
"gradient",
|
|
153
200
|
]
|
|
154
201
|
)
|
|
155
202
|
|
|
@@ -188,8 +235,8 @@ class Proximity:
|
|
|
188
235
|
query_df = self.df[self.df[self.id_column].isin(ids)]
|
|
189
236
|
query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
|
|
190
237
|
|
|
191
|
-
# Transform query features
|
|
192
|
-
X_query = self.
|
|
238
|
+
# Transform query features (subclass-specific)
|
|
239
|
+
X_query = self._transform_features(query_df)
|
|
193
240
|
|
|
194
241
|
# Get neighbors
|
|
195
242
|
if radius is not None:
|
|
@@ -216,20 +263,7 @@ class Proximity:
|
|
|
216
263
|
df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
|
|
217
264
|
return df_results.drop("is_self", axis=1).reset_index(drop=True)
|
|
218
265
|
|
|
219
|
-
def
|
|
220
|
-
"""Remove non-numeric features and log warnings."""
|
|
221
|
-
non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
|
|
222
|
-
if non_numeric:
|
|
223
|
-
log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
|
|
224
|
-
return [f for f in features if f not in non_numeric]
|
|
225
|
-
|
|
226
|
-
def _build_model(self) -> None:
|
|
227
|
-
"""Standardize features and fit Nearest Neighbors model."""
|
|
228
|
-
self.scaler = StandardScaler()
|
|
229
|
-
X = self.scaler.fit_transform(self.df[self.features])
|
|
230
|
-
self.nn = NearestNeighbors().fit(X)
|
|
231
|
-
|
|
232
|
-
def _precompute_metrics(self, n_neighbors: int = 10) -> None:
|
|
266
|
+
def _precompute_metrics(self) -> None:
|
|
233
267
|
"""
|
|
234
268
|
Precompute landscape metrics for all compounds.
|
|
235
269
|
|
|
@@ -243,12 +277,9 @@ class Proximity:
|
|
|
243
277
|
"""
|
|
244
278
|
log.info("Precomputing proximity metrics...")
|
|
245
279
|
|
|
246
|
-
#
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
# Get nearest neighbors for all points (including self)
|
|
250
|
-
X = self.scaler.transform(self.df[self.features])
|
|
251
|
-
distances, indices = self.nn.kneighbors(X, n_neighbors=2) # Just need nearest neighbor
|
|
280
|
+
# Get nearest neighbors for all points (n=2 because index 0 is self)
|
|
281
|
+
X = self._transform_features(self.df)
|
|
282
|
+
distances, indices = self.nn.kneighbors(X, n_neighbors=2)
|
|
252
283
|
|
|
253
284
|
# Extract nearest neighbor (index 1, since index 0 is self)
|
|
254
285
|
self.df["nn_distance"] = distances[:, 1]
|
|
@@ -285,126 +316,20 @@ class Proximity:
|
|
|
285
316
|
result = {
|
|
286
317
|
self.id_column: query_id,
|
|
287
318
|
"neighbor_id": neighbor_id,
|
|
288
|
-
"distance": 0.0 if distance < 1e-
|
|
319
|
+
"distance": 0.0 if distance < 1e-6 else distance,
|
|
289
320
|
}
|
|
290
321
|
|
|
291
322
|
# Add target if present
|
|
292
323
|
if self.target and self.target in self.df.columns:
|
|
293
324
|
result[self.target] = neighbor_row[self.target]
|
|
294
325
|
|
|
295
|
-
# Add tracked columns
|
|
296
|
-
for col in self.track_columns:
|
|
297
|
-
if col in self.df.columns:
|
|
298
|
-
result[col] = neighbor_row[col]
|
|
299
|
-
|
|
300
326
|
# Add prediction/probability columns if they exist
|
|
301
327
|
for col in self.df.columns:
|
|
302
328
|
if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
|
|
303
329
|
result[col] = neighbor_row[col]
|
|
304
330
|
|
|
305
|
-
|
|
306
|
-
|
|
331
|
+
# Include all columns if requested
|
|
332
|
+
if self.include_all_columns:
|
|
333
|
+
result.update(neighbor_row.to_dict())
|
|
307
334
|
|
|
308
|
-
|
|
309
|
-
if __name__ == "__main__":
|
|
310
|
-
|
|
311
|
-
pd.set_option("display.max_columns", None)
|
|
312
|
-
pd.set_option("display.width", 1000)
|
|
313
|
-
|
|
314
|
-
# Create a sample DataFrame
|
|
315
|
-
data = {
|
|
316
|
-
"ID": [1, 2, 3, 4, 5],
|
|
317
|
-
"Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
|
|
318
|
-
"Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
|
|
319
|
-
"Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
|
|
320
|
-
}
|
|
321
|
-
df = pd.DataFrame(data)
|
|
322
|
-
|
|
323
|
-
# Test the Proximity class
|
|
324
|
-
features = ["Feature1", "Feature2", "Feature3"]
|
|
325
|
-
prox = Proximity(df, id_column="ID", features=features)
|
|
326
|
-
print(prox.neighbors(1, n_neighbors=2))
|
|
327
|
-
|
|
328
|
-
# Test the neighbors method with radius
|
|
329
|
-
print(prox.neighbors(1, radius=2.0))
|
|
330
|
-
|
|
331
|
-
# Test with Features list
|
|
332
|
-
prox = Proximity(df, id_column="ID", features=["Feature1"])
|
|
333
|
-
print(prox.neighbors(1))
|
|
334
|
-
|
|
335
|
-
# Create a sample DataFrame
|
|
336
|
-
data = {
|
|
337
|
-
"foo_id": ["a", "b", "c", "d", "e"], # Testing string IDs
|
|
338
|
-
"Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
|
|
339
|
-
"Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
|
|
340
|
-
"target": [1, 0, 1, 0, 5],
|
|
341
|
-
}
|
|
342
|
-
df = pd.DataFrame(data)
|
|
343
|
-
|
|
344
|
-
# Test with String Ids
|
|
345
|
-
prox = Proximity(
|
|
346
|
-
df,
|
|
347
|
-
id_column="foo_id",
|
|
348
|
-
features=["Feature1", "Feature2"],
|
|
349
|
-
target="target",
|
|
350
|
-
track_columns=["Feature1", "Feature2"],
|
|
351
|
-
)
|
|
352
|
-
print(prox.neighbors(["a", "b"]))
|
|
353
|
-
|
|
354
|
-
# Test duplicate IDs
|
|
355
|
-
data = {
|
|
356
|
-
"foo_id": ["a", "b", "c", "d", "d"], # Duplicate ID (d)
|
|
357
|
-
"Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
|
|
358
|
-
"Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
|
|
359
|
-
"target": [1, 0, 1, 0, 5],
|
|
360
|
-
}
|
|
361
|
-
df = pd.DataFrame(data)
|
|
362
|
-
prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
|
|
363
|
-
print(df.equals(prox.df))
|
|
364
|
-
|
|
365
|
-
# Test with a categorical feature
|
|
366
|
-
from workbench.api import FeatureSet, Model
|
|
367
|
-
|
|
368
|
-
fs = FeatureSet("aqsol_features")
|
|
369
|
-
model = Model("aqsol-regression")
|
|
370
|
-
features = model.features()
|
|
371
|
-
df = fs.pull_dataframe()
|
|
372
|
-
prox = Proximity(
|
|
373
|
-
df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
|
|
374
|
-
)
|
|
375
|
-
print(prox.neighbors(df[fs.id_column].tolist()[:3]))
|
|
376
|
-
|
|
377
|
-
print("\n" + "=" * 80)
|
|
378
|
-
print("Testing isolated_compounds...")
|
|
379
|
-
print("=" * 80)
|
|
380
|
-
|
|
381
|
-
# Test isolated data in the top 1%
|
|
382
|
-
isolated_1pct = prox.isolated(top_percent=1.0)
|
|
383
|
-
print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
|
|
384
|
-
print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
|
|
385
|
-
|
|
386
|
-
# Test isolated data in the top 5%
|
|
387
|
-
isolated_5pct = prox.isolated(top_percent=5.0)
|
|
388
|
-
print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
|
|
389
|
-
print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
|
|
390
|
-
|
|
391
|
-
print("\n" + "=" * 80)
|
|
392
|
-
print("Testing target_gradients...")
|
|
393
|
-
print("=" * 80)
|
|
394
|
-
|
|
395
|
-
# Test with different parameters
|
|
396
|
-
gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
|
|
397
|
-
print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
|
|
398
|
-
print(
|
|
399
|
-
gradients_1pct[
|
|
400
|
-
[fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
|
|
401
|
-
].head(10)
|
|
402
|
-
)
|
|
403
|
-
|
|
404
|
-
gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
|
|
405
|
-
print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
|
|
406
|
-
print(
|
|
407
|
-
gradients_5pct[
|
|
408
|
-
[fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
|
|
409
|
-
].head(10)
|
|
410
|
-
)
|
|
335
|
+
return result
|
|
@@ -4,7 +4,7 @@ from typing import Union
|
|
|
4
4
|
import logging
|
|
5
5
|
|
|
6
6
|
# Workbench Imports
|
|
7
|
-
from workbench.algorithms.dataframe import Proximity
|
|
7
|
+
from workbench.algorithms.dataframe.proximity import Proximity
|
|
8
8
|
from workbench.api.graph_store import GraphStore
|
|
9
9
|
|
|
10
10
|
# Set up logging
|
|
@@ -132,7 +132,7 @@ class ProximityGraph:
|
|
|
132
132
|
|
|
133
133
|
|
|
134
134
|
if __name__ == "__main__":
|
|
135
|
-
from workbench.algorithms.dataframe.
|
|
135
|
+
from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
|
|
136
136
|
from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity
|
|
137
137
|
from workbench.web_interface.components.plugins.graph_plot import GraphPlot
|
|
138
138
|
from workbench.api import DFStore
|
|
@@ -157,9 +157,9 @@ if __name__ == "__main__":
|
|
|
157
157
|
}
|
|
158
158
|
feature_df = pd.DataFrame(feature_data)
|
|
159
159
|
|
|
160
|
-
# Build a graph using
|
|
161
|
-
print("\n---
|
|
162
|
-
prox =
|
|
160
|
+
# Build a graph using FeatureSpaceProximity
|
|
161
|
+
print("\n--- FeatureSpaceProximity Class ---")
|
|
162
|
+
prox = FeatureSpaceProximity(feature_df, id_column="id", features=["Feature1", "Feature2"], target="target")
|
|
163
163
|
feature_graph = ProximityGraph()
|
|
164
164
|
feature_graph.build_graph(prox)
|
|
165
165
|
nx_graph = feature_graph.nx_graph
|