workbench 0.8.193-py3-none-any.whl → 0.8.197-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
- workbench/algorithms/dataframe/proximity.py +212 -234
- workbench/algorithms/graph/light/proximity_graph.py +8 -7
- workbench/api/endpoint.py +2 -3
- workbench/api/model.py +2 -5
- workbench/core/artifacts/endpoint_core.py +25 -16
- workbench/core/artifacts/feature_set_core.py +126 -4
- workbench/core/artifacts/model_core.py +9 -14
- workbench/core/transforms/features_to_model/features_to_model.py +3 -3
- workbench/core/views/training_view.py +75 -0
- workbench/core/views/view.py +1 -1
- workbench/model_scripts/custom_models/proximity/proximity.py +212 -234
- workbench/model_scripts/custom_models/uq_models/proximity.py +212 -234
- workbench/model_scripts/pytorch_model/generated_model_script.py +567 -0
- workbench/model_scripts/uq_models/generated_model_script.py +589 -0
- workbench/model_scripts/uq_models/mapie.template +103 -6
- workbench/model_scripts/xgb_model/generated_model_script.py +4 -4
- workbench/repl/workbench_shell.py +3 -3
- workbench/utils/model_utils.py +10 -7
- workbench/utils/xgboost_model_utils.py +93 -34
- workbench/web_interface/components/plugin_unit_test.py +5 -2
- workbench/web_interface/components/plugins/model_details.py +2 -5
- {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/METADATA +1 -1
- {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/RECORD +29 -27
- {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/WHEEL +0 -0
- {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/proximity.py (the same +212/-234 change is mirrored in the two model_scripts copies of proximity.py listed above):

```diff
@@ -2,22 +2,13 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 import logging
-import pickle
-import json
-from pathlib import Path
-from enum import Enum
 
 # Set up logging
 log = logging.getLogger("workbench")
 
 
-class ProximityType(Enum):
-    DISTANCE = "distance"
-    SIMILARITY = "similarity"
-
-
 class Proximity:
     def __init__(
         self,
```
```diff
@@ -26,7 +17,6 @@ class Proximity:
         features: List[str],
         target: Optional[str] = None,
         track_columns: Optional[List[str]] = None,
-        n_neighbors: int = 10,
     ):
         """
         Initialize the Proximity class.
```
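The constructor shrinks here: `n_neighbors` is no longer fixed at build time and moves to the query side. A minimal before/after sketch (the import path comes from the file list above; the old call shape is inferred from the removed line):

```python
import pandas as pd
from workbench.algorithms.dataframe.proximity import Proximity

df = pd.DataFrame({"ID": [1, 2, 3], "Feature1": [0.1, 0.2, 0.9]})

# 0.8.193: neighbor count was fixed at construction
#   prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=10)

# 0.8.197: the argument is gone; pass n_neighbors per query instead
prox = Proximity(df, id_column="ID", features=["Feature1"])
print(prox.neighbors(1, n_neighbors=2))
```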
```diff
@@ -37,64 +27,132 @@
             features: List of feature column names to be used for neighbor computations.
             target: Name of the target column. Defaults to None.
             track_columns: Additional columns to track in results. Defaults to None.
-            n_neighbors: Number of neighbors to compute. Defaults to 10.
         """
         self.id_column = id_column
         self.target = target
         self.track_columns = track_columns or []
-        self.proximity_type = None
-        self.scaler = None
-        self.X = None
-        self.nn = None
 
         # Filter out non-numeric features
         self.features = self._validate_features(df, features)
 
         # Drop NaN rows and set up DataFrame
         self.df = df.dropna(subset=self.features).copy()
-
+
+        # Compute target range if target is provided
+        self.target_range = None
+        if self.target and self.target in self.df.columns:
+            self.target_range = self.df[self.target].max() - self.df[self.target].min()
 
         # Build the proximity model
-        self.
+        self._build_model()
 
-
-
-        non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
-        if non_numeric:
-            log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
-            return [f for f in features if f not in non_numeric]
-        return features
+        # Precompute landscape metrics
+        self._precompute_metrics()
 
-    def
-    """
-
-
-
-
+    def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
+        """
+        Find isolated data points based on distance to nearest neighbor.
+
+        Args:
+            top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)
+
+        Returns:
+            DataFrame of observations above the percentile threshold, sorted by distance (descending)
+        """
+        percentile = 100 - top_percent
+        threshold = np.percentile(self.df["nn_distance"], percentile)
+        isolated = self.df[self.df["nn_distance"] >= threshold].copy()
+        return isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
 
-    def
+    def target_gradients(
+        self,
+        top_percent: float = 1.0,
+        min_delta: Optional[float] = None,
+        k_neighbors: int = 5,
+    ) -> pd.DataFrame:
         """
-
+        Find compounds with steep target gradients (data quality issues and activity cliffs).
+
+        Uses a two-phase approach:
+        1. Quick filter using nearest neighbor gradient
+        2. Verify using k-neighbor median to handle cases where the nearest neighbor is the outlier
+
+        Args:
+            top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
+            min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
+            k_neighbors: Number of neighbors to use for median calculation (default: 5)
 
         Returns:
-            DataFrame of
+            DataFrame of compounds with steepest gradients, sorted by gradient (descending)
         """
-
+        if self.target is None:
+            raise ValueError("Target column must be specified")
+
+        epsilon = 1e-5
+
+        # Phase 1: Quick filter using precomputed nearest neighbor
+        candidates = self.df.copy()
+        candidates["gradient"] = candidates["nn_target_diff"] / (candidates["nn_distance"] + epsilon)
 
-
-
-
+        # Apply min_delta
+        if min_delta is None:
+            min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
+        candidates = candidates[candidates["nn_target_diff"] >= min_delta]
+
+        # Get top X% by initial gradient
+        percentile = 100 - top_percent
+        threshold = np.percentile(candidates["gradient"], percentile)
+        candidates = candidates[candidates["gradient"] >= threshold].copy()
+
+        # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
+        results = []
+        for _, row in candidates.iterrows():
+            cmpd_id = row[self.id_column]
+            cmpd_target = row[self.target]
+
+            # Get k nearest neighbors (excluding self)
+            nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
+
+            # Calculate median target of k nearest neighbors
+            neighbor_median = nbrs.head(k_neighbors)[self.target].median()
+            median_diff = abs(cmpd_target - neighbor_median)
+
+            # Only keep if compound differs from neighborhood median
+            # This filters out cases where the nearest neighbor is the outlier
+            if median_diff >= min_delta:
+                mean_distance = nbrs.head(k_neighbors)["distance"].mean()
+
+                results.append(
+                    {
+                        self.id_column: cmpd_id,
+                        self.target: cmpd_target,
+                        "neighbor_median": neighbor_median,
+                        "neighbor_median_diff": median_diff,
+                        "mean_distance": mean_distance,
+                        "gradient": median_diff / (mean_distance + epsilon),
+                    }
+                )
+
+        # Handle empty results
+        if not results:
+            return pd.DataFrame(
+                columns=[
+                    self.id_column,
+                    self.target,
+                    "neighbor_median",
+                    "neighbor_median_diff",
+                    "mean_distance",
+                    "gradient",
+                ]
             )
-                for i, (dists, nbrs) in enumerate(zip(distances, indices))
-                for neighbor_idx, dist in zip(nbrs, dists)
-                if neighbor_idx != i  # Skip self
-            ]
 
-
+        results_df = pd.DataFrame(results)
+        results_df = results_df.sort_values("gradient", ascending=False).reset_index(drop=True)
+        return results_df
 
     def neighbors(
         self,
-        id_or_ids,
+        id_or_ids: Union[str, int, List[Union[str, int]]],
        n_neighbors: Optional[int] = 5,
        radius: Optional[float] = None,
        include_self: bool = True,
```
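A quick numeric check of the Phase 1 score, since the docstring only names it: the gradient is |Δtarget| / (distance + ε), so a large target jump across a small feature-space distance dominates. A standalone sketch with made-up numbers:

```python
import numpy as np

epsilon = 1e-5
nn_target_diff = np.array([0.10, 2.00, 0.05])  # |target - nearest neighbor's target|
nn_distance = np.array([0.50, 0.10, 0.50])     # distance to nearest neighbor

gradient = nn_target_diff / (nn_distance + epsilon)
print(gradient.round(2))  # ~[0.2, 20.0, 0.1] -> the middle point is the cliff candidate
```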
```diff
@@ -104,9 +162,9 @@
 
         Args:
             id_or_ids: Single ID or list of IDs to look up
-            n_neighbors: Number of neighbors to return (default: 5)
+            n_neighbors: Number of neighbors to return (default: 5, ignored if radius is set)
             radius: If provided, find all neighbors within this radius
-            include_self: Whether to include self in results (
+            include_self: Whether to include self in results (default: True)
 
         Returns:
             DataFrame containing neighbors and distances
```
```diff
@@ -123,38 +181,6 @@
         query_df = self.df[self.df[self.id_column].isin(ids)]
         query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
 
-        # Use the core implementation
-        return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
-
-    def find_neighbors(
-        self,
-        query_df: pd.DataFrame,
-        n_neighbors: Optional[int] = 5,
-        radius: Optional[float] = None,
-        include_self: bool = True,
-    ) -> pd.DataFrame:
-        """
-        Return neighbors for rows in a query DataFrame.
-
-        Args:
-            query_df: DataFrame containing query points
-            n_neighbors: Number of neighbors to return (default: 5)
-            radius: If provided, find all neighbors within this radius
-            include_self: Whether to include self in results (if present)
-
-        Returns:
-            DataFrame containing neighbors and distances
-        """
-        # Validate features
-        missing = set(self.features) - set(query_df.columns)
-        if missing:
-            raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")
-
-        id_column_present = self.id_column in query_df.columns
-
-        # Handle NaN rows
-        query_df = self._handle_nan_rows(query_df, id_column_present)
-
         # Transform query features
         X_query = self.scaler.transform(query_df[self.features])
 
```
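Note what leaves with this hunk: `find_neighbors()` accepted ad-hoc query frames (with its own feature validation and NaN handling), while the surviving `neighbors()` only resolves IDs already present in the DataFrame given to the constructor. If you previously queried unseen rows, one workaround (my assumption, not a documented API) is to concatenate them before building the model:

```python
import pandas as pd
from workbench.algorithms.dataframe.proximity import Proximity

# Hypothetical frames standing in for the old find_neighbors() use case
train_df = pd.DataFrame({"ID": [1, 2, 3], "Feature1": [0.1, 0.2, 0.9]})
query_df = pd.DataFrame({"ID": [6], "Feature1": [0.31]})

# Caveat: the query rows now also influence the StandardScaler fit
combined = pd.concat([train_df, query_df], ignore_index=True)
prox = Proximity(combined, id_column="ID", features=["Feature1"])
print(prox.neighbors(query_df["ID"].tolist(), include_self=False))
```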
```diff
@@ -167,30 +193,71 @@
         # Build results
         results = []
         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            query_id = query_df.iloc[i][self.id_column]
+            query_id = query_df.iloc[i][self.id_column]
 
             for neighbor_idx, dist in zip(nbrs, dists):
                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
 
-                # Skip
+                # Skip self if requested
                 if not include_self and neighbor_id == query_id:
                     continue
 
                 results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
 
-
-
+        df_results = pd.DataFrame(results)
+        df_results["is_self"] = df_results["neighbor_id"] == df_results[self.id_column]
+        df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
+        return df_results.drop("is_self", axis=1).reset_index(drop=True)
 
-    def
-    """
-
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
 
-
-
-
-
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
 
-
+    def _precompute_metrics(self, n_neighbors: int = 10) -> None:
+        """
+        Precompute landscape metrics for all compounds.
+
+        Adds columns to self.df:
+        - nn_distance: Distance to nearest neighbor
+        - nn_id: ID of nearest neighbor
+
+        If target is specified, also adds:
+        - nn_target: Target value of nearest neighbor
+        - nn_target_diff: Absolute difference from nearest neighbor target
+        """
+        log.info("Precomputing proximity metrics...")
+
+        # Make sure n_neighbors isn't greater than dataset size
+        n_neighbors = min(n_neighbors, len(self.df) - 1)
+
+        # Get nearest neighbors for all points (including self)
+        X = self.scaler.transform(self.df[self.features])
+        distances, indices = self.nn.kneighbors(X, n_neighbors=2)  # Just need nearest neighbor
+
+        # Extract nearest neighbor (index 1, since index 0 is self)
+        self.df["nn_distance"] = distances[:, 1]
+        self.df["nn_id"] = self.df.iloc[indices[:, 1]][self.id_column].values
+
+        # If target exists, compute target-based metrics
+        if self.target and self.target in self.df.columns:
+            # Get target values for nearest neighbor
+            nn_target_values = self.df.iloc[indices[:, 1]][self.target].values
+            self.df["nn_target"] = nn_target_values
+            self.df["nn_target_diff"] = np.abs(self.df[self.target].values - nn_target_values)
+
+            # Precompute target range for min_delta default
+            self.target_range = self.df[self.target].max() - self.df[self.target].min()
+
+        log.info("Proximity metrics precomputed successfully")
 
     def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
         """
```
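The `n_neighbors=2` call in `_precompute_metrics()` leans on a scikit-learn behavior: querying the training set against itself returns each point as its own nearest neighbor at distance 0, so column 1 holds the real nearest neighbor. A tiny standalone check:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0], [1.0], [3.0]])
distances, indices = NearestNeighbors().fit(X).kneighbors(X, n_neighbors=2)
print(distances[:, 0])  # [0. 0. 0.] -- each point matches itself first
print(indices[:, 1])    # [1 0 1]   -- column 1 is the actual nearest neighbor
```

With exact duplicate rows, self is not guaranteed to sort first, which is presumably why `_build_neighbor_result()` below snaps sub-1e-5 distances to 0.0 and `neighbors()` sorts on an `is_self` flag.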
```diff
@@ -204,111 +271,31 @@
         Returns:
             Dictionary containing neighbor information
         """
-        neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
         neighbor_row = self.df.iloc[neighbor_idx]
+        neighbor_id = neighbor_row[self.id_column]
 
         # Start with basic info
         result = {
             self.id_column: query_id,
             "neighbor_id": neighbor_id,
-            "distance": distance,
+            "distance": 0.0 if distance < 1e-5 else distance,
         }
 
-        #
-
-
-            + self.track_columns
-            + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
-        )
+        # Add target if present
+        if self.target and self.target in self.df.columns:
+            result[self.target] = neighbor_row[self.target]
 
-        # Add
-        for col in
+        # Add tracked columns
+        for col in self.track_columns:
             if col in self.df.columns:
                 result[col] = neighbor_row[col]
 
-        #
-
-
-
-    def serialize(self, directory: str) -> None:
-        """
-        Serialize the Proximity model to a directory.
-
-        Args:
-            directory: Directory path to save the model components
-        """
-        dir_path = Path(directory)
-        dir_path.mkdir(parents=True, exist_ok=True)
-
-        # Save metadata
-        metadata = {
-            "id_column": self.id_column,
-            "features": self.features,
-            "target": self.target,
-            "track_columns": self.track_columns,
-            "n_neighbors": self.n_neighbors,
-        }
-
-        (dir_path / "metadata.json").write_text(json.dumps(metadata))
-
-        # Save DataFrame
-        self.df.to_pickle(dir_path / "df.pkl")
-
-        # Save models
-        with open(dir_path / "scaler.pkl", "wb") as f:
-            pickle.dump(self.scaler, f)
-
-        with open(dir_path / "nn_model.pkl", "wb") as f:
-            pickle.dump(self.nn, f)
-
-        log.info(f"Proximity model serialized to {directory}")
-
-    @classmethod
-    def deserialize(cls, directory: str) -> "Proximity":
-        """
-        Deserialize a Proximity model from a directory.
-
-        Args:
-            directory: Directory path containing the serialized model components
-
-        Returns:
-            A new Proximity instance
-        """
-        dir_path = Path(directory)
-        if not dir_path.is_dir():
-            raise ValueError(f"Directory {directory} does not exist or is not a directory")
-
-        # Load metadata
-        metadata = json.loads((dir_path / "metadata.json").read_text())
-
-        # Load DataFrame
-        df_path = dir_path / "df.pkl"
-        if not df_path.exists():
-            raise FileNotFoundError(f"DataFrame file not found at {df_path}")
-        df = pd.read_pickle(df_path)
-
-        # Create instance without calling __init__
-        instance = cls.__new__(cls)
-        instance.df = df
-        instance.id_column = metadata["id_column"]
-        instance.features = metadata["features"]
-        instance.target = metadata["target"]
-        instance.track_columns = metadata["track_columns"]
-        instance.n_neighbors = metadata["n_neighbors"]
-
-        # Load models
-        with open(dir_path / "scaler.pkl", "rb") as f:
-            instance.scaler = pickle.load(f)
-
-        with open(dir_path / "nn_model.pkl", "rb") as f:
-            instance.nn = pickle.load(f)
-
-        # Restore X
-        instance.X = instance.scaler.transform(instance.df[instance.features])
-        instance.proximity_type = ProximityType.DISTANCE
+        # Add prediction/probability columns if they exist
+        for col in self.df.columns:
+            if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
+                result[col] = neighbor_row[col]
 
-
-        return instance
+        return result
 
 
 # Testing the Proximity class
```
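With `serialize()`/`deserialize()` removed (along with the `pickle`/`json`/`Path` imports in the first hunk), 0.8.197 ships no built-in persistence for `Proximity`. A stopgap (my suggestion, not a workbench API) is to pickle the whole object, which holds only a DataFrame, a StandardScaler, and a NearestNeighbors model:

```python
import pickle

# prox: a fitted Proximity instance
with open("proximity.pkl", "wb") as f:
    pickle.dump(prox, f)

with open("proximity.pkl", "rb") as f:
    prox_restored = pickle.load(f)
```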
```diff
@@ -328,28 +315,15 @@ if __name__ == "__main__":
 
     # Test the Proximity class
     features = ["Feature1", "Feature2", "Feature3"]
-    prox = Proximity(df, id_column="ID", features=features
-    print(prox.
-
-    # Test the neighbors method
-    print(prox.neighbors(1))
+    prox = Proximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
 
     # Test the neighbors method with radius
     print(prox.neighbors(1, radius=2.0))
 
-    # Test with data that isn't in the 'train' dataframe
-    query_data = {
-        "ID": [6],
-        "Feature1": [0.31],
-        "Feature2": [0.31],
-        "Feature3": [2.31],
-    }
-    query_df = pd.DataFrame(query_data)
-    print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()
-
     # Test with Features list
-    prox = Proximity(df, id_column="ID", features=["Feature1"]
-    print(prox.
+    prox = Proximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
 
     # Create a sample DataFrame
     data = {
```
```diff
@@ -367,40 +341,9 @@ if __name__ == "__main__":
         features=["Feature1", "Feature2"],
         target="target",
         track_columns=["Feature1", "Feature2"],
-        n_neighbors=3,
     )
-    print(prox.all_neighbors())
-
-    # Test the neighbors method
     print(prox.neighbors(["a", "b"]))
 
-    # Time neighbors with all IDs versus calling all_neighbors
-    import time
-
-    start_time = time.time()
-    prox_df = prox.find_neighbors(query_df=df, include_self=False)
-    end_time = time.time()
-    print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
-    start_time = time.time()
-    prox_df_all = prox.all_neighbors()
-    end_time = time.time()
-    print(f"Time taken for all_neighbors: {end_time - start_time:.4f} seconds")
-
-    # Now compare the two dataframes
-    print("Neighbors DataFrame:")
-    print(prox_df)
-    print("\nAll Neighbors DataFrame:")
-    print(prox_df_all)
-    # Check for any discrepancies
-    if prox_df.equals(prox_df_all):
-        print("The two DataFrames are equal :)")
-    else:
-        print("ERROR: The two DataFrames are not equal!")
-
-    # Test querying without the id_column
-    df_no_id = df.drop(columns=["foo_id"])
-    print(prox.find_neighbors(query_df=df_no_id, include_self=False))
-
     # Test duplicate IDs
     data = {
         "foo_id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
```
```diff
@@ -409,17 +352,52 @@
         "target": [1, 0, 1, 0, 5],
     }
     df = pd.DataFrame(data)
-    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target"
+    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
     print(df.equals(prox.df))
 
     # Test with a categorical feature
     from workbench.api import FeatureSet, Model
 
-    fs = FeatureSet("
-    model = Model("
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
     features = model.features()
     df = fs.pull_dataframe()
     prox = Proximity(
         df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
     )
-    print(prox.
+    print(prox.neighbors(df[fs.id_column].tolist()[:3]))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
+    print(
+        gradients_1pct[
+            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
+        ].head(10)
+    )
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(
+        gradients_5pct[
+            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
+        ].head(10)
+    )
```