workbench 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (84)
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  3. workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
  4. workbench/algorithms/dataframe/projection_2d.py +44 -21
  5. workbench/algorithms/dataframe/proximity.py +78 -150
  6. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  7. workbench/algorithms/models/cleanlab_model.py +382 -0
  8. workbench/algorithms/models/noise_model.py +388 -0
  9. workbench/algorithms/sql/outliers.py +3 -3
  10. workbench/api/__init__.py +3 -0
  11. workbench/api/df_store.py +17 -108
  12. workbench/api/endpoint.py +13 -11
  13. workbench/api/feature_set.py +111 -8
  14. workbench/api/meta_model.py +289 -0
  15. workbench/api/model.py +45 -12
  16. workbench/api/parameter_store.py +3 -52
  17. workbench/cached/cached_model.py +4 -4
  18. workbench/core/artifacts/artifact.py +5 -5
  19. workbench/core/artifacts/df_store_core.py +114 -0
  20. workbench/core/artifacts/endpoint_core.py +228 -237
  21. workbench/core/artifacts/feature_set_core.py +185 -230
  22. workbench/core/artifacts/model_core.py +34 -26
  23. workbench/core/artifacts/parameter_store_core.py +98 -0
  24. workbench/core/pipelines/pipeline_executor.py +1 -1
  25. workbench/core/transforms/features_to_model/features_to_model.py +22 -10
  26. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
  27. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  28. workbench/model_script_utils/model_script_utils.py +339 -0
  29. workbench/model_script_utils/pytorch_utils.py +405 -0
  30. workbench/model_script_utils/uq_harness.py +278 -0
  31. workbench/model_scripts/chemprop/chemprop.template +428 -631
  32. workbench/model_scripts/chemprop/generated_model_script.py +432 -635
  33. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  34. workbench/model_scripts/chemprop/requirements.txt +2 -10
  35. workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
  36. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  37. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  38. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  39. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  40. workbench/model_scripts/meta_model/meta_model.template +209 -0
  41. workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
  42. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  43. workbench/model_scripts/pytorch_model/pytorch.template +370 -609
  44. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  45. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  46. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  47. workbench/model_scripts/script_generation.py +6 -5
  48. workbench/model_scripts/uq_models/generated_model_script.py +65 -422
  49. workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
  50. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  51. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  52. workbench/model_scripts/xgb_model/xgb_model.template +366 -396
  53. workbench/repl/workbench_shell.py +0 -5
  54. workbench/resources/open_source_api.key +1 -1
  55. workbench/scripts/endpoint_test.py +2 -2
  56. workbench/scripts/meta_model_sim.py +35 -0
  57. workbench/scripts/training_test.py +85 -0
  58. workbench/utils/chem_utils/fingerprints.py +87 -46
  59. workbench/utils/chem_utils/projections.py +16 -6
  60. workbench/utils/chemprop_utils.py +36 -655
  61. workbench/utils/meta_model_simulator.py +499 -0
  62. workbench/utils/metrics_utils.py +256 -0
  63. workbench/utils/model_utils.py +192 -54
  64. workbench/utils/pytorch_utils.py +33 -472
  65. workbench/utils/shap_utils.py +1 -55
  66. workbench/utils/xgboost_local_crossfold.py +267 -0
  67. workbench/utils/xgboost_model_utils.py +49 -356
  68. workbench/web_interface/components/model_plot.py +7 -1
  69. workbench/web_interface/components/plugins/model_details.py +30 -68
  70. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  71. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
  72. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
  73. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
  74. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  75. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
  76. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  77. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  78. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
  79. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  80. workbench/model_scripts/uq_models/mapie.template +0 -605
  81. workbench/model_scripts/uq_models/requirements.txt +0 -1
  82. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  83. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
  84. {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -22,7 +22,14 @@ class Projection2D:
22
22
  self.log = logging.getLogger("workbench")
23
23
  self.projection_model = None
24
24
 
25
- def fit_transform(self, input_df: pd.DataFrame, features: list = None, projection: str = "UMAP") -> pd.DataFrame:
25
+ def fit_transform(
26
+ self,
27
+ input_df: pd.DataFrame,
28
+ features: list = None,
29
+ feature_matrix: np.ndarray = None,
30
+ metric: str = "euclidean",
31
+ projection: str = "UMAP",
32
+ ) -> pd.DataFrame:
26
33
  """Fit and transform a DataFrame using the selected dimensionality reduction method.
27
34
 
28
35
  This method creates a copy of the input DataFrame, processes the specified features
@@ -32,6 +39,9 @@ class Projection2D:
32
39
  Args:
33
40
  input_df (pd.DataFrame): The DataFrame containing features to project.
34
41
  features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
42
+ feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
43
+ and no scaling is applied (caller is responsible for appropriate preprocessing).
44
+ metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
35
45
  projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.
36
46
 
37
47
  Returns:
@@ -40,36 +50,44 @@ class Projection2D:
40
50
  # Create a copy of the input DataFrame
41
51
  df = input_df.copy()
42
52
 
43
- # Auto-identify numeric features if none are provided
44
- if features is None:
45
- features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
46
- self.log.info(f"Auto-identified numeric features: {features}")
47
-
48
- if len(features) < 2 or df.empty:
49
- self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
50
- return df
51
-
52
- # Process a copy of the feature data for projection
53
- X = df[features]
54
- X = X.apply(lambda col: col.fillna(col.mean()))
55
- X_scaled = StandardScaler().fit_transform(X)
53
+ # If a feature matrix is provided, use it directly (no scaling)
54
+ if feature_matrix is not None:
55
+ if len(feature_matrix) != len(df):
56
+ self.log.critical("feature_matrix length must match DataFrame length.")
57
+ return df
58
+ X_processed = feature_matrix
59
+ else:
60
+ # Auto-identify numeric features if none are provided
61
+ if features is None:
62
+ features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
63
+ self.log.info(f"Auto-identified numeric features: {features}")
64
+
65
+ if len(features) < 2 or df.empty:
66
+ self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
67
+ return df
68
+
69
+ # Process a copy of the feature data for projection
70
+ X = df[features]
71
+ X = X.apply(lambda col: col.fillna(col.mean()))
72
+ X_processed = StandardScaler().fit_transform(X)
56
73
 
57
74
  # Select the projection method (using df for perplexity calculation)
58
- self.projection_model = self._get_projection_model(projection, df)
75
+ self.projection_model = self._get_projection_model(projection, df, metric=metric)
59
76
 
60
- # Apply the projection on the normalized data
61
- projection_result = self.projection_model.fit_transform(X_scaled)
77
+ # Apply the projection on the processed data
78
+ projection_result = self.projection_model.fit_transform(X_processed)
62
79
  df[["x", "y"]] = projection_result
63
80
 
64
81
  # Resolve coincident points and return the new DataFrame
65
82
  return self.resolve_coincident_points(df)
66
83
 
67
- def _get_projection_model(self, projection: str, df: pd.DataFrame):
84
+ def _get_projection_model(self, projection: str, df: pd.DataFrame, metric: str = "euclidean"):
68
85
  """Select and return the appropriate projection model.
69
86
 
70
87
  Args:
71
88
  projection (str): The projection method ('TSNE', 'MDS', 'PCA', or 'UMAP').
72
89
  df (pd.DataFrame): The DataFrame being transformed (used for computing perplexity).
90
+ metric (str): Distance metric for UMAP (default 'euclidean').
73
91
 
74
92
  Returns:
75
93
  A dimensionality reduction model instance.
@@ -88,8 +106,14 @@ class Projection2D:
88
106
  return PCA(n_components=2)
89
107
 
90
108
  if projection == "UMAP" and UMAP_AVAILABLE:
91
- self.log.info("Projection: UMAP")
92
- return umap.UMAP(n_components=2)
109
+ # UMAP default n_neighbors=15, adjust if dataset is smaller
110
+ n_neighbors = min(15, len(df) - 1)
111
+ if n_neighbors < 15:
112
+ self.log.warning(
113
+ f"Dataset size ({len(df)}) smaller than default n_neighbors, using n_neighbors={n_neighbors}"
114
+ )
115
+ self.log.info(f"Projection: UMAP with metric={metric}, n_neighbors={n_neighbors}")
116
+ return umap.UMAP(n_components=2, metric=metric, n_neighbors=n_neighbors)
93
117
 
94
118
  self.log.warning(
95
119
  f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
@@ -118,7 +142,6 @@ class Projection2D:
118
142
 
119
143
  # Find duplicates
120
144
  duplicated = rounded.duplicated(subset=["x_round", "y_round"], keep=False)
121
- print("Coincident Points found:", duplicated.sum())
122
145
  if not duplicated.any():
123
146
  return df
124
147
 
@@ -1,7 +1,6 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
- from sklearn.preprocessing import StandardScaler
4
- from sklearn.neighbors import NearestNeighbors
3
+ from abc import ABC, abstractmethod
5
4
  from typing import List, Dict, Optional, Union
6
5
  import logging
7
6
 
@@ -9,14 +8,16 @@ import logging
9
8
  log = logging.getLogger("workbench")
10
9
 
11
10
 
12
- class Proximity:
11
+ class Proximity(ABC):
12
+ """Abstract base class for proximity/neighbor computations."""
13
+
13
14
  def __init__(
14
15
  self,
15
16
  df: pd.DataFrame,
16
17
  id_column: str,
17
18
  features: List[str],
18
19
  target: Optional[str] = None,
19
- track_columns: Optional[List[str]] = None,
20
+ include_all_columns: bool = False,
20
21
  ):
21
22
  """
22
23
  Initialize the Proximity class.
@@ -26,29 +27,61 @@ class Proximity:
26
27
  id_column: Name of the column used as the identifier.
27
28
  features: List of feature column names to be used for neighbor computations.
28
29
  target: Name of the target column. Defaults to None.
29
- track_columns: Additional columns to track in results. Defaults to None.
30
+ include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
30
31
  """
31
32
  self.id_column = id_column
33
+ self.features = features
32
34
  self.target = target
33
- self.track_columns = track_columns or []
35
+ self.include_all_columns = include_all_columns
34
36
 
35
- # Filter out non-numeric features
36
- self.features = self._validate_features(df, features)
37
+ # Store the DataFrame (subclasses may filter/modify in _prepare_data)
38
+ self.df = df.copy()
37
39
 
38
- # Drop NaN rows and set up DataFrame
39
- self.df = df.dropna(subset=self.features).copy()
40
+ # Prepare data (subclasses can override)
41
+ self._prepare_data()
40
42
 
41
43
  # Compute target range if target is provided
42
44
  self.target_range = None
43
45
  if self.target and self.target in self.df.columns:
44
46
  self.target_range = self.df[self.target].max() - self.df[self.target].min()
45
47
 
46
- # Build the proximity model
48
+ # Build the proximity model (subclass-specific)
47
49
  self._build_model()
48
50
 
49
51
  # Precompute landscape metrics
50
52
  self._precompute_metrics()
51
53
 
54
+ # Define core columns for output (subclasses can override)
55
+ self._set_core_columns()
56
+
57
+ # Project the data to 2D (subclass-specific)
58
+ self._project_2d()
59
+
60
+ def _prepare_data(self) -> None:
61
+ """Prepare the data before building the model. Subclasses can override."""
62
+ pass
63
+
64
+ def _set_core_columns(self) -> None:
65
+ """Set the core columns for output. Subclasses can override."""
66
+ self.core_columns = [self.id_column, "nn_distance", "nn_id"]
67
+ if self.target:
68
+ self.core_columns.extend([self.target, "nn_target", "nn_target_diff"])
69
+
70
+ @abstractmethod
71
+ def _build_model(self) -> None:
72
+ """Build the proximity model. Must set self.nn (NearestNeighbors instance)."""
73
+ pass
74
+
75
+ @abstractmethod
76
+ def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
77
+ """Transform features for querying. Returns feature matrix for nearest neighbor lookup."""
78
+ pass
79
+
80
+ @abstractmethod
81
+ def _project_2d(self) -> None:
82
+ """Project the data to 2D for visualization. Updates self.df with 'x' and 'y' columns."""
83
+ pass
84
+
52
85
  def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
53
86
  """
54
87
  Find isolated data points based on distance to nearest neighbor.
@@ -62,7 +95,19 @@ class Proximity:
62
95
  percentile = 100 - top_percent
63
96
  threshold = np.percentile(self.df["nn_distance"], percentile)
64
97
  isolated = self.df[self.df["nn_distance"] >= threshold].copy()
65
- return isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
98
+ isolated = isolated.sort_values("nn_distance", ascending=False).reset_index(drop=True)
99
+ return isolated if self.include_all_columns else isolated[self.core_columns]
100
+
101
+ def proximity_stats(self) -> pd.DataFrame:
102
+ """
103
+ Return distribution statistics for nearest neighbor distances.
104
+
105
+ Returns:
106
+ DataFrame with proximity distribution statistics (count, mean, std, percentiles)
107
+ """
108
+ return (
109
+ self.df["nn_distance"].describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_frame()
110
+ )
66
111
 
67
112
  def target_gradients(
68
113
  self,
@@ -90,7 +135,7 @@ class Proximity:
90
135
  if self.target is None:
91
136
  raise ValueError("Target column must be specified")
92
137
 
93
- epsilon = 1e-5
138
+ epsilon = 1e-6
94
139
 
95
140
  # Phase 1: Quick filter using precomputed nearest neighbor
96
141
  candidates = self.df.copy()
@@ -111,13 +156,13 @@ class Proximity:
111
156
  threshold = np.percentile(candidates["gradient"], percentile)
112
157
  candidates = candidates[candidates["gradient"] >= threshold].copy()
113
158
 
114
- # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
159
+ # Phase 2: Verify with K-neighbor median to filter out cases where nearest neighbor is the outlier
115
160
  results = []
116
161
  for _, row in candidates.iterrows():
117
162
  cmpd_id = row[self.id_column]
118
163
  cmpd_target = row[self.target]
119
164
 
120
- # Get k nearest neighbors (excluding self)
165
+ # Get K nearest neighbors (excluding self)
121
166
  nbrs = self.neighbors(cmpd_id, n_neighbors=k_neighbors, include_self=False)
122
167
 
123
168
  # Calculate median target of k neighbors, excluding the nearest neighbor (index 0)
@@ -146,10 +191,12 @@ class Proximity:
146
191
  columns=[
147
192
  self.id_column,
148
193
  self.target,
194
+ "nn_target",
195
+ "nn_target_diff",
196
+ "nn_distance",
197
+ "gradient",
149
198
  "neighbor_median",
150
199
  "neighbor_median_diff",
151
- "mean_distance",
152
- "gradient",
153
200
  ]
154
201
  )
155
202
 
@@ -188,8 +235,8 @@ class Proximity:
188
235
  query_df = self.df[self.df[self.id_column].isin(ids)]
189
236
  query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
190
237
 
191
- # Transform query features
192
- X_query = self.scaler.transform(query_df[self.features])
238
+ # Transform query features (subclass-specific)
239
+ X_query = self._transform_features(query_df)
193
240
 
194
241
  # Get neighbors
195
242
  if radius is not None:
@@ -216,20 +263,7 @@ class Proximity:
216
263
  df_results = df_results.sort_values([self.id_column, "is_self", "distance"], ascending=[True, False, True])
217
264
  return df_results.drop("is_self", axis=1).reset_index(drop=True)
218
265
 
219
- def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
220
- """Remove non-numeric features and log warnings."""
221
- non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
222
- if non_numeric:
223
- log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
224
- return [f for f in features if f not in non_numeric]
225
-
226
- def _build_model(self) -> None:
227
- """Standardize features and fit Nearest Neighbors model."""
228
- self.scaler = StandardScaler()
229
- X = self.scaler.fit_transform(self.df[self.features])
230
- self.nn = NearestNeighbors().fit(X)
231
-
232
- def _precompute_metrics(self, n_neighbors: int = 10) -> None:
266
+ def _precompute_metrics(self) -> None:
233
267
  """
234
268
  Precompute landscape metrics for all compounds.
235
269
 
@@ -243,12 +277,9 @@ class Proximity:
243
277
  """
244
278
  log.info("Precomputing proximity metrics...")
245
279
 
246
- # Make sure n_neighbors isn't greater than dataset size
247
- n_neighbors = min(n_neighbors, len(self.df) - 1)
248
-
249
- # Get nearest neighbors for all points (including self)
250
- X = self.scaler.transform(self.df[self.features])
251
- distances, indices = self.nn.kneighbors(X, n_neighbors=2) # Just need nearest neighbor
280
+ # Get nearest neighbors for all points (n=2 because index 0 is self)
281
+ X = self._transform_features(self.df)
282
+ distances, indices = self.nn.kneighbors(X, n_neighbors=2)
252
283
 
253
284
  # Extract nearest neighbor (index 1, since index 0 is self)
254
285
  self.df["nn_distance"] = distances[:, 1]
@@ -285,126 +316,23 @@ class Proximity:
285
316
  result = {
286
317
  self.id_column: query_id,
287
318
  "neighbor_id": neighbor_id,
288
- "distance": 0.0 if distance < 1e-5 else distance,
319
+ "distance": 0.0 if distance < 1e-6 else distance,
289
320
  }
290
321
 
291
322
  # Add target if present
292
323
  if self.target and self.target in self.df.columns:
293
324
  result[self.target] = neighbor_row[self.target]
294
325
 
295
- # Add tracked columns
296
- for col in self.track_columns:
297
- if col in self.df.columns:
298
- result[col] = neighbor_row[col]
299
-
300
326
  # Add prediction/probability columns if they exist
301
327
  for col in self.df.columns:
302
328
  if col == "prediction" or "_proba" in col or "residual" in col or col == "in_model":
303
329
  result[col] = neighbor_row[col]
304
330
 
305
- return result
306
-
331
+ # Include all columns if requested
332
+ if self.include_all_columns:
333
+ result.update(neighbor_row.to_dict())
334
+ # Restore query_id after update (neighbor_row may have overwritten id column)
335
+ result[self.id_column] = query_id
336
+ result["neighbor_id"] = neighbor_id
307
337
 
308
- # Testing the Proximity class
309
- if __name__ == "__main__":
310
-
311
- pd.set_option("display.max_columns", None)
312
- pd.set_option("display.width", 1000)
313
-
314
- # Create a sample DataFrame
315
- data = {
316
- "ID": [1, 2, 3, 4, 5],
317
- "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
318
- "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
319
- "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
320
- }
321
- df = pd.DataFrame(data)
322
-
323
- # Test the Proximity class
324
- features = ["Feature1", "Feature2", "Feature3"]
325
- prox = Proximity(df, id_column="ID", features=features)
326
- print(prox.neighbors(1, n_neighbors=2))
327
-
328
- # Test the neighbors method with radius
329
- print(prox.neighbors(1, radius=2.0))
330
-
331
- # Test with Features list
332
- prox = Proximity(df, id_column="ID", features=["Feature1"])
333
- print(prox.neighbors(1))
334
-
335
- # Create a sample DataFrame
336
- data = {
337
- "foo_id": ["a", "b", "c", "d", "e"], # Testing string IDs
338
- "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
339
- "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
340
- "target": [1, 0, 1, 0, 5],
341
- }
342
- df = pd.DataFrame(data)
343
-
344
- # Test with String Ids
345
- prox = Proximity(
346
- df,
347
- id_column="foo_id",
348
- features=["Feature1", "Feature2"],
349
- target="target",
350
- track_columns=["Feature1", "Feature2"],
351
- )
352
- print(prox.neighbors(["a", "b"]))
353
-
354
- # Test duplicate IDs
355
- data = {
356
- "foo_id": ["a", "b", "c", "d", "d"], # Duplicate ID (d)
357
- "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
358
- "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
359
- "target": [1, 0, 1, 0, 5],
360
- }
361
- df = pd.DataFrame(data)
362
- prox = Proximity(df, id_column="foo_id", features=["Feature1", "Feature2"], target="target")
363
- print(df.equals(prox.df))
364
-
365
- # Test with a categorical feature
366
- from workbench.api import FeatureSet, Model
367
-
368
- fs = FeatureSet("aqsol_features")
369
- model = Model("aqsol-regression")
370
- features = model.features()
371
- df = fs.pull_dataframe()
372
- prox = Proximity(
373
- df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
374
- )
375
- print(prox.neighbors(df[fs.id_column].tolist()[:3]))
376
-
377
- print("\n" + "=" * 80)
378
- print("Testing isolated_compounds...")
379
- print("=" * 80)
380
-
381
- # Test isolated data in the top 1%
382
- isolated_1pct = prox.isolated(top_percent=1.0)
383
- print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
384
- print(isolated_1pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
385
-
386
- # Test isolated data in the top 5%
387
- isolated_5pct = prox.isolated(top_percent=5.0)
388
- print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
389
- print(isolated_5pct[[fs.id_column, "nn_distance", "nn_id"]].head(10))
390
-
391
- print("\n" + "=" * 80)
392
- print("Testing target_gradients...")
393
- print("=" * 80)
394
-
395
- # Test with different parameters
396
- gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
397
- print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
398
- print(
399
- gradients_1pct[
400
- [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
401
- ].head(10)
402
- )
403
-
404
- gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
405
- print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
406
- print(
407
- gradients_5pct[
408
- [fs.id_column, model.target(), "neighbor_median", "neighbor_median_diff", "mean_distance", "gradient"]
409
- ].head(10)
410
- )
338
+ return result
@@ -4,7 +4,7 @@ from typing import Union
4
4
  import logging
5
5
 
6
6
  # Workbench Imports
7
- from workbench.algorithms.dataframe import Proximity
7
+ from workbench.algorithms.dataframe.proximity import Proximity
8
8
  from workbench.api.graph_store import GraphStore
9
9
 
10
10
  # Set up logging
@@ -132,7 +132,7 @@ class ProximityGraph:
132
132
 
133
133
 
134
134
  if __name__ == "__main__":
135
- from workbench.algorithms.dataframe.proximity import Proximity
135
+ from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
136
136
  from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity
137
137
  from workbench.web_interface.components.plugins.graph_plot import GraphPlot
138
138
  from workbench.api import DFStore
@@ -157,9 +157,9 @@ if __name__ == "__main__":
157
157
  }
158
158
  feature_df = pd.DataFrame(feature_data)
159
159
 
160
- # Build a graph using the base Proximity class
161
- print("\n--- Proximity Class ---")
162
- prox = Proximity(feature_df, id_column="id", features=["Feature1", "Feature2"], target="target")
160
+ # Build a graph using FeatureSpaceProximity
161
+ print("\n--- FeatureSpaceProximity Class ---")
162
+ prox = FeatureSpaceProximity(feature_df, id_column="id", features=["Feature1", "Feature2"], target="target")
163
163
  feature_graph = ProximityGraph()
164
164
  feature_graph.build_graph(prox)
165
165
  nx_graph = feature_graph.nx_graph