workbench 0.8.193-py3-none-any.whl → 0.8.197-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +2 -2
  3. workbench/algorithms/dataframe/proximity.py +212 -234
  4. workbench/algorithms/graph/light/proximity_graph.py +8 -7
  5. workbench/api/endpoint.py +2 -3
  6. workbench/api/model.py +2 -5
  7. workbench/core/artifacts/endpoint_core.py +25 -16
  8. workbench/core/artifacts/feature_set_core.py +126 -4
  9. workbench/core/artifacts/model_core.py +9 -14
  10. workbench/core/transforms/features_to_model/features_to_model.py +3 -3
  11. workbench/core/views/training_view.py +75 -0
  12. workbench/core/views/view.py +1 -1
  13. workbench/model_scripts/custom_models/proximity/proximity.py +212 -234
  14. workbench/model_scripts/custom_models/uq_models/proximity.py +212 -234
  15. workbench/model_scripts/pytorch_model/generated_model_script.py +567 -0
  16. workbench/model_scripts/uq_models/generated_model_script.py +589 -0
  17. workbench/model_scripts/uq_models/mapie.template +103 -6
  18. workbench/model_scripts/xgb_model/generated_model_script.py +4 -4
  19. workbench/repl/workbench_shell.py +3 -3
  20. workbench/utils/model_utils.py +10 -7
  21. workbench/utils/xgboost_model_utils.py +93 -34
  22. workbench/web_interface/components/plugin_unit_test.py +5 -2
  23. workbench/web_interface/components/plugins/model_details.py +2 -5
  24. {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/METADATA +1 -1
  25. {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/RECORD +29 -27
  26. {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/WHEEL +0 -0
  27. {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/entry_points.txt +0 -0
  28. {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/licenses/LICENSE +0 -0
  29. {workbench-0.8.193.dist-info → workbench-0.8.197.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/__init__.py CHANGED
@@ -5,14 +5,13 @@ These classes provide functionality for Pandas Dataframes
 - TBD: TBD
 """
 
-from .proximity import Proximity, ProximityType
+from .proximity import Proximity
 from .feature_space_proximity import FeatureSpaceProximity
 from .fingerprint_proximity import FingerprintProximity
 from .projection_2d import Projection2D
 
 __all__ = [
     "Proximity",
-    "ProximityType",
     "FeatureSpaceProximity",
     "FingerprintProximity",
     "Projection2D",
workbench/algorithms/dataframe/fingerprint_proximity.py CHANGED
@@ -5,7 +5,7 @@ from typing import Union, List
 import logging
 
 # Workbench Imports
-from workbench.algorithms.dataframe.proximity import Proximity, ProximityType
+from workbench.algorithms.dataframe.proximity import Proximity
 
 # Set up logging
 log = logging.getLogger("workbench")
@@ -36,7 +36,7 @@ class FingerprintProximity(Proximity):
         Converts fingerprint strings to binary arrays and initializes NearestNeighbors.
         """
         log.info("Converting fingerprints to binary feature matrix...")
-        self.proximity_type = ProximityType.SIMILARITY
+        # self.proximity_type = ProximityType.SIMILARITY
 
         # Convert fingerprint strings to binary arrays
 
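Note: ProximityType is removed from both the proximity module and the package __init__, so any downstream import of it will raise ImportError on 0.8.197. A minimal migration sketch (import paths taken from the diff above):

    # 0.8.193 and earlier:
    # from workbench.algorithms.dataframe import Proximity, ProximityType

    # 0.8.197: Proximity is distance-based only, so simply drop ProximityType
    from workbench.algorithms.dataframe import Proximity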
workbench/algorithms/dataframe/proximity.py CHANGED
@@ -2,22 +2,13 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 import logging
-import pickle
-import json
-from pathlib import Path
-from enum import Enum
 
 # Set up logging
 log = logging.getLogger("workbench")
 
 
-class ProximityType(Enum):
-    DISTANCE = "distance"
-    SIMILARITY = "similarity"
-
-
 class Proximity:
     def __init__(
         self,
@@ -26,7 +17,6 @@ class Proximity:
         features: List[str],
         target: Optional[str] = None,
         track_columns: Optional[List[str]] = None,
-        n_neighbors: int = 10,
     ):
         """
         Initialize the Proximity class.
@@ -37,64 +27,132 @@
             features: List of feature column names to be used for neighbor computations.
             target: Name of the target column. Defaults to None.
             track_columns: Additional columns to track in results. Defaults to None.
-            n_neighbors: Number of neighbors to compute. Defaults to 10.
         """
         self.id_column = id_column
         self.target = target
         self.track_columns = track_columns or []
-        self.proximity_type = None
-        self.scaler = None
-        self.X = None
-        self.nn = None
 
         # Filter out non-numeric features
         self.features = self._validate_features(df, features)
 
         # Drop NaN rows and set up DataFrame
         self.df = df.dropna(subset=self.features).copy()
-        self.n_neighbors = min(n_neighbors, len(self.df) - 1)
+
+        # Compute target range if target is provided
+        self.target_range = None
+        if self.target and self.target in self.df.columns:
+            self.target_range = self.df[self.target].max() - self.df[self.target].min()
 
         # Build the proximity model
-        self.build_proximity_model()
+        self._build_model()
 
-    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
-        """Remove non-numeric features and log warnings."""
-        non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
-        if non_numeric:
-            log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
-            return [f for f in features if f not in non_numeric]
-        return features
+        # Precompute landscape metrics
+        self._precompute_metrics()
 
-    def build_proximity_model(self) -> None:
-        """Standardize features and fit Nearest Neighbors model."""
-        self.proximity_type = ProximityType.DISTANCE
-        self.scaler = StandardScaler()
-        self.X = self.scaler.fit_transform(self.df[self.features])
-        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(self.X)
+    def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
+        """
+        Find isolated data points based on distance to nearest neighbor.
+
+        Args:
+            top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)
+
+        Returns:
+            DataFrame of observations above the percentile threshold, sorted by distance (descending)
+        """
+        percentile = 100 - top_percent
+        threshold = np.percentile(self.df["nn_distance"], percentile)
+        isolated = self.df[self.df["nn_distance"] >= threshold].copy()
+        return isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
 
-    def all_neighbors(self) -> pd.DataFrame:
+    def target_gradients(
+        self,
+        top_percent: float = 1.0,
+        min_delta: Optional[float] = None,
+        k_neighbors: int = 5,
+    ) -> pd.DataFrame:
         """
-        Compute nearest neighbors for all rows in the dataset.
+        Find compounds with steep target gradients (data quality issues and activity cliffs).
+
+        Uses a two-phase approach:
+        1. Quick filter using nearest neighbor gradient
+        2. Verify using k-neighbor median to handle cases where the nearest neighbor is the outlier
+
+        Args:
+            top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
+            min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
+            k_neighbors: Number of neighbors to use for median calculation (default: 5)
 
         Returns:
-            DataFrame of neighbors and their distances.
+            DataFrame of compounds with steepest gradients, sorted by gradient (descending)
         """
-        distances, indices = self.nn.kneighbors(self.X)
+        if self.target is None:
+            raise ValueError("Target column must be specified")
+
+        epsilon = 1e-5
+
+        # Phase 1: Quick filter using precomputed nearest neighbor
+        candidates = self.df.copy()
+        candidates["gradient"] = candidates["nn_target_diff"] / (candidates["nn_distance"] + epsilon)
 
-        results = [
-            self._build_neighbor_result(
-                query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+        # Apply min_delta
+        if min_delta is None:
+            min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
+        candidates = candidates[candidates["nn_target_diff"] >= min_delta]
+
+        # Get top X% by initial gradient
+        percentile = 100 - top_percent
+        threshold = np.percentile(candidates["gradient"], percentile)
+        candidates = candidates[candidates["gradient"] >= threshold].copy()
+
+        # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
+        results = []
+        for _, row in candidates.iterrows():
+            cmpd_id = row[self.id_column]
+            cmpd_target = row[self.target]
+
+            # Get k nearest neighbors (excluding self)
+            nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
+
+            # Calculate median target of k nearest neighbors
+            neighbor_median = nbrs.head(k_neighbors)[self.target].median()
+            median_diff = abs(cmpd_target - neighbor_median)
+
+            # Only keep if compound differs from neighborhood median
+            # This filters out cases where the nearest neighbor is the outlier
+            if median_diff >= min_delta:
+                mean_distance = nbrs.head(k_neighbors)["distance"].mean()
+
+                results.append(
+                    {
+                        self.id_column: cmpd_id,
+                        self.target: cmpd_target,
+                        "neighbor_median": neighbor_median,
+                        "neighbor_median_diff": median_diff,
+                        "mean_distance": mean_distance,
+                        "gradient": median_diff / (mean_distance + epsilon),
+                    }
+                )
+
+        # Handle empty results
+        if not results:
+            return pd.DataFrame(
+                columns=[
+                    self.id_column,
+                    self.target,
+                    "neighbor_median",
+                    "neighbor_median_diff",
+                    "mean_distance",
+                    "gradient",
+                ]
             )
-            for i, (dists, nbrs) in enumerate(zip(distances, indices))
-            for neighbor_idx, dist in zip(nbrs, dists)
-            if neighbor_idx != i  # Skip self
-        ]
 
-        return pd.DataFrame(results)
+        results_df = pd.DataFrame(results)
+        results_df = results_df.sort_values("gradient", ascending=False).reset_index(drop=True)
+        return results_df
 
     def neighbors(
         self,
-        id_or_ids,
+        id_or_ids: Union[str, int, List[Union[str, int]]],
         n_neighbors: Optional[int] = 5,
         radius: Optional[float] = None,
         include_self: bool = True,
@@ -104,9 +162,9 @@
 
         Args:
             id_or_ids: Single ID or list of IDs to look up
-            n_neighbors: Number of neighbors to return (default: 5)
+            n_neighbors: Number of neighbors to return (default: 5, ignored if radius is set)
             radius: If provided, find all neighbors within this radius
-            include_self: Whether to include self in results (if present)
+            include_self: Whether to include self in results (default: True)
 
         Returns:
             DataFrame containing neighbors and distances
@@ -123,38 +181,6 @@ class Proximity:
         query_df = self.df[self.df[self.id_column].isin(ids)]
         query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
 
-        # Use the core implementation
-        return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
-
-    def find_neighbors(
-        self,
-        query_df: pd.DataFrame,
-        n_neighbors: Optional[int] = 5,
-        radius: Optional[float] = None,
-        include_self: bool = True,
-    ) -> pd.DataFrame:
-        """
-        Return neighbors for rows in a query DataFrame.
-
-        Args:
-            query_df: DataFrame containing query points
-            n_neighbors: Number of neighbors to return (default: 5)
-            radius: If provided, find all neighbors within this radius
-            include_self: Whether to include self in results (if present)
-
-        Returns:
-            DataFrame containing neighbors and distances
-        """
-        # Validate features
-        missing = set(self.features) - set(query_df.columns)
-        if missing:
-            raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")
-
-        id_column_present = self.id_column in query_df.columns
-
-        # Handle NaN rows
-        query_df = self._handle_nan_rows(query_df, id_column_present)
-
 
         # Transform query features
         X_query = self.scaler.transform(query_df[self.features])
@@ -167,30 +193,71 @@
         # Build results
         results = []
         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"
+            query_id = query_df.iloc[i][self.id_column]
 
             for neighbor_idx, dist in zip(nbrs, dists):
                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
 
-                # Skip if neighbor is self and include_self is False
+                # Skip self if requested
                 if not include_self and neighbor_id == query_id:
                     continue
 
                 results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
 
-        results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
-        return results_df
+        df_results = pd.DataFrame(results)
+        df_results["is_self"] = df_results["neighbor_id"] == df_results[self.id_column]
+        df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
+        return df_results.drop("is_self", axis=1).reset_index(drop=True)
 
-    def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
-        """Drop rows with NaN values in feature columns and log warnings."""
-        rows_with_nan = query_df[self.features].isna().any(axis=1)
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
 
-        if rows_with_nan.any():
-            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
-            if id_column_present:
-                log.warning(query_df.loc[rows_with_nan, self.id_column])
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
 
-        return query_df.dropna(subset=self.features)
+    def _precompute_metrics(self, n_neighbors: int = 10) -> None:
+        """
+        Precompute landscape metrics for all compounds.
+
+        Adds columns to self.df:
+        - nn_distance: Distance to nearest neighbor
+        - nn_id: ID of nearest neighbor
+
+        If target is specified, also adds:
+        - nn_target: Target value of nearest neighbor
+        - nn_target_diff: Absolute difference from nearest neighbor target
+        """
+        log.info("Precomputing proximity metrics...")
+
+        # Make sure n_neighbors isn't greater than dataset size
+        n_neighbors = min(n_neighbors, len(self.df) - 1)
+
+        # Get nearest neighbors for all points (including self)
+        X = self.scaler.transform(self.df[self.features])
+        distances, indices = self.nn.kneighbors(X, n_neighbors=2)  # Just need nearest neighbor
+
+        # Extract nearest neighbor (index 1, since index 0 is self)
+        self.df["nn_distance"] = distances[:, 1]
+        self.df["nn_id"] = self.df.iloc[indices[:, 1]][self.id_column].values
+
+        # If target exists, compute target-based metrics
+        if self.target and self.target in self.df.columns:
+            # Get target values for nearest neighbor
+            nn_target_values = self.df.iloc[indices[:, 1]][self.target].values
+            self.df["nn_target"] = nn_target_values
+            self.df["nn_target_diff"] = np.abs(self.df[self.target].values - nn_target_values)
+
+            # Precompute target range for min_delta default
+            self.target_range = self.df[self.target].max() - self.df[self.target].min()
+
+        log.info("Proximity metrics precomputed successfully")
 
     def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
         """
@@ -204,111 +271,31 @@
         Returns:
             Dictionary containing neighbor information
         """
-        neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
         neighbor_row = self.df.iloc[neighbor_idx]
+        neighbor_id = neighbor_row[self.id_column]
 
         # Start with basic info
         result = {
             self.id_column: query_id,
             "neighbor_id": neighbor_id,
-            "distance": distance,
+            "distance": 0.0 if distance < 1e-5 else distance,
         }
 
-        # Columns to automatically include if they exist
-        auto_include = (
-            ([self.target, "prediction"] if self.target else [])
-            + self.track_columns
-            + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
-        )
+        # Add target if present
+        if self.target and self.target in self.df.columns:
+            result[self.target] = neighbor_row[self.target]
 
-        # Add values for existing columns
-        for col in auto_include:
+        # Add tracked columns
+        for col in self.track_columns:
             if col in self.df.columns:
                 result[col] = neighbor_row[col]
 
-        # Truncate very small distances to zero
-        result["distance"] = 0.0 if distance < 1e-7 else distance
-        return result
-
-    def serialize(self, directory: str) -> None:
-        """
-        Serialize the Proximity model to a directory.
-
-        Args:
-            directory: Directory path to save the model components
-        """
-        dir_path = Path(directory)
-        dir_path.mkdir(parents=True, exist_ok=True)
-
-        # Save metadata
-        metadata = {
-            "id_column": self.id_column,
-            "features": self.features,
-            "target": self.target,
-            "track_columns": self.track_columns,
-            "n_neighbors": self.n_neighbors,
-        }
-
-        (dir_path / "metadata.json").write_text(json.dumps(metadata))
-
-        # Save DataFrame
-        self.df.to_pickle(dir_path / "df.pkl")
-
-        # Save models
-        with open(dir_path / "scaler.pkl", "wb") as f:
-            pickle.dump(self.scaler, f)
-
-        with open(dir_path / "nn_model.pkl", "wb") as f:
-            pickle.dump(self.nn, f)
-
-        log.info(f"Proximity model serialized to {directory}")
-
-    @classmethod
-    def deserialize(cls, directory: str) -> "Proximity":
-        """
-        Deserialize a Proximity model from a directory.
-
-        Args:
-            directory: Directory path containing the serialized model components
-
-        Returns:
-            A new Proximity instance
-        """
-        dir_path = Path(directory)
-        if not dir_path.is_dir():
-            raise ValueError(f"Directory {directory} does not exist or is not a directory")
-
-        # Load metadata
-        metadata = json.loads((dir_path / "metadata.json").read_text())
-
-        # Load DataFrame
-        df_path = dir_path / "df.pkl"
-        if not df_path.exists():
-            raise FileNotFoundError(f"DataFrame file not found at {df_path}")
-        df = pd.read_pickle(df_path)
-
-        # Create instance without calling __init__
-        instance = cls.__new__(cls)
-        instance.df = df
-        instance.id_column = metadata["id_column"]
-        instance.features = metadata["features"]
-        instance.target = metadata["target"]
-        instance.track_columns = metadata["track_columns"]
-        instance.n_neighbors = metadata["n_neighbors"]
-
-        # Load models
-        with open(dir_path / "scaler.pkl", "rb") as f:
-            instance.scaler = pickle.load(f)
-
-        with open(dir_path / "nn_model.pkl", "rb") as f:
-            instance.nn = pickle.load(f)
-
-        # Restore X
-        instance.X = instance.scaler.transform(instance.df[instance.features])
-        instance.proximity_type = ProximityType.DISTANCE
+        # Add prediction/probability columns if they exist
+        for col in self.df.columns:
+            if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
+                result[col] = neighbor_row[col]
 
-        log.info(f"Proximity model deserialized from {directory}")
-        return instance
+        return result
 
 
 # Testing the Proximity class
@@ -328,28 +315,15 @@ if __name__ == "__main__":
 
     # Test the Proximity class
     features = ["Feature1", "Feature2", "Feature3"]
-    prox = Proximity(df, id_column="ID", features=features, n_neighbors=3)
-    print(prox.all_neighbors())
-
-    # Test the neighbors method
-    print(prox.neighbors(1))
+    prox = Proximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
 
     # Test the neighbors method with radius
    print(prox.neighbors(1, radius=2.0))
 
-    # Test with data that isn't in the 'train' dataframe
-    query_data = {
-        "ID": [6],
-        "Feature1": [0.31],
-        "Feature2": [0.31],
-        "Feature3": [2.31],
-    }
-    query_df = pd.DataFrame(query_data)
-    print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()
-
     # Test with Features list
-    prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
-    print(prox.all_neighbors())
+    prox = Proximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
 
     # Create a sample DataFrame
     data = {
@@ -367,40 +341,9 @@ if __name__ == "__main__":
         features=["Feature1", "Feature2"],
         target="target",
         track_columns=["Feature1", "Feature2"],
-        n_neighbors=3,
     )
-    print(prox.all_neighbors())
-
-    # Test the neighbors method
     print(prox.neighbors(["a", "b"]))
 
-    # Time neighbors with all IDs versus calling all_neighbors
-    import time
-
-    start_time = time.time()
-    prox_df = prox.find_neighbors(query_df=df, include_self=False)
-    end_time = time.time()
-    print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
-    start_time = time.time()
-    prox_df_all = prox.all_neighbors()
-    end_time = time.time()
-    print(f"Time taken for all_neighbors: {end_time - start_time:.4f} seconds")
-
-    # Now compare the two dataframes
-    print("Neighbors DataFrame:")
-    print(prox_df)
-    print("\nAll Neighbors DataFrame:")
-    print(prox_df_all)
-    # Check for any discrepancies
-    if prox_df.equals(prox_df_all):
-        print("The two DataFrames are equal :)")
-    else:
-        print("ERROR: The two DataFrames are not equal!")
-
-    # Test querying without the id_column
-    df_no_id = df.drop(columns=["foo_id"])
-    print(prox.find_neighbors(query_df=df_no_id, include_self=False))
-
     # Test duplicate IDs
     data = {
         "foo_id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
@@ -409,17 +352,52 @@ if __name__ == "__main__":
         "target": [1, 0, 1, 0, 5],
     }
     df = pd.DataFrame(data)
-    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target", n_neighbors=3)
+    prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
     print(df.equals(prox.df))
 
     # Test with a categorical feature
     from workbench.api import FeatureSet, Model
 
-    fs = FeatureSet("abalone_features")
-    model = Model("abalone-regression")
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
     features = model.features()
     df = fs.pull_dataframe()
     prox = Proximity(
         df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
     )
-    print(prox.find_neighbors(query_df=df[0:2]))
+    print(prox.neighbors(df[fs.id_column].tolist()[:3]))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
+    print(
+        gradients_1pct[
+            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
+        ].head(10)
+    )
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(
+        gradients_5pct[
+            [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
+        ].head(10)
+    )
workbench/algorithms/graph/light/proximity_graph.py CHANGED
@@ -4,7 +4,7 @@ from typing import Union
 import logging
 
 # Workbench Imports
-from workbench.algorithms.dataframe import Proximity, ProximityType
+from workbench.algorithms.dataframe import Proximity
 from workbench.api.graph_store import GraphStore
 
 # Set up logging
@@ -50,12 +50,13 @@ class ProximityGraph:
         self._nx_graph.add_nodes_from(node_df.set_index(id_column, drop=False).to_dict("index").items())
 
         # Determine edge weights based on proximity type
-        if prox.proximity_type == ProximityType.SIMILARITY:
-            all_neighbors_df["weight"] = all_neighbors_df["similarity"]
-        elif prox.proximity_type == ProximityType.DISTANCE:
-            # Normalize and invert distance
-            max_distance = all_neighbors_df["distance"].max()
-            all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance
+        # if prox.proximity_type == ProximityType.SIMILARITY:
+        #     all_neighbors_df["weight"] = all_neighbors_df["similarity"]
+        # elif prox.proximity_type == ProximityType.DISTANCE:
+
+        # Normalize and invert distance
+        max_distance = all_neighbors_df["distance"].max()
+        all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance
 
         # Add edges to the graph
         log.info("Adding edges to the graph...")
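Note: with ProximityType gone, ProximityGraph now always derives edge weights from distances. The normalize-and-invert step maps a zero distance to weight 1.0 and the largest observed distance to weight 0.0, for example:

    import pandas as pd

    # Toy neighbor table; mirrors the weight computation in the hunk above
    all_neighbors_df = pd.DataFrame({"distance": [0.0, 1.0, 4.0]})
    max_distance = all_neighbors_df["distance"].max()
    all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance
    print(all_neighbors_df["weight"].tolist())  # [1.0, 0.75, 0.0]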
workbench/api/endpoint.py CHANGED
@@ -4,7 +4,6 @@ Endpoints can be viewed in the AWS Sagemaker interfaces or in the Workbench
 Dashboard UI, which provides additional model details and performance metrics"""
 
 import pandas as pd
-from typing import Tuple
 
 # Workbench Imports
 from workbench.core.artifacts.endpoint_core import EndpointCore
@@ -71,14 +70,14 @@ class Endpoint(EndpointCore):
         """
         return super().fast_inference(eval_df, threads=threads)
 
-    def cross_fold_inference(self, nfolds: int = 5) -> Tuple[dict, pd.DataFrame]:
+    def cross_fold_inference(self, nfolds: int = 5) -> pd.DataFrame:
         """Run cross-fold inference (only works for XGBoost models)
 
         Args:
             nfolds (int): The number of folds to use for cross-validation (default: 5)
 
         Returns:
-            Tuple(dict, pd.DataFrame): A tuple containing a dictionary of metrics and a DataFrame with predictions
+            pd.DataFrame: A DataFrame with cross fold predictions
         """
         return super().cross_fold_inference(nfolds)
 
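Note: cross_fold_inference() now returns just the predictions DataFrame rather than a (metrics, predictions) tuple, so 0.8.193-era tuple unpacking must be dropped. A minimal sketch (assuming Endpoint is exported from workbench.api alongside FeatureSet and Model; the endpoint name is illustrative):

    from workbench.api import Endpoint  # assumed export, mirrors FeatureSet/Model

    end = Endpoint("aqsol-regression")  # hypothetical endpoint name

    # 0.8.193: metrics, preds = end.cross_fold_inference(nfolds=5)
    preds = end.cross_fold_inference(nfolds=5)  # 0.8.197: a single DataFrame
    print(preds.head())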