workbench 0.8.183__py3-none-any.whl → 0.8.184__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

@@ -2,10 +2,9 @@ import pandas as pd
2
2
  import numpy as np
3
3
  from sklearn.preprocessing import StandardScaler
4
4
  from sklearn.neighbors import NearestNeighbors
5
- from typing import List, Dict
5
+ from typing import List, Dict, Optional
6
6
  import logging
7
7
  import pickle
8
- import os
9
8
  import json
10
9
  from pathlib import Path
11
10
  from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
14
13
  log = logging.getLogger("workbench")
15
14
 
16
15
 
17
- # ^Enumerated^ Proximity Types (distance or similarity)
18
16
  class ProximityType(Enum):
19
17
  DISTANCE = "distance"
20
18
  SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
26
24
  df: pd.DataFrame,
27
25
  id_column: str,
28
26
  features: List[str],
29
- target: str = None,
30
- track_columns: List[str] = None,
27
+ target: Optional[str] = None,
28
+ track_columns: Optional[List[str]] = None,
31
29
  n_neighbors: int = 10,
32
30
  ):
33
31
  """
34
32
  Initialize the Proximity class.
35
33
 
36
34
  Args:
37
- df (pd.DataFrame): DataFrame containing data for neighbor computations.
38
- id_column (str): Name of the column used as the identifier.
39
- features (List[str]): List of feature column names to be used for neighbor computations.
40
- target (str, optional): Name of the target column. Defaults to None.
41
- track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
42
- n_neighbors (int): Number of neighbors to compute. Defaults to 10.
35
+ df: DataFrame containing data for neighbor computations.
36
+ id_column: Name of the column used as the identifier.
37
+ features: List of feature column names to be used for neighbor computations.
38
+ target: Name of the target column. Defaults to None.
39
+ track_columns: Additional columns to track in results. Defaults to None.
40
+ n_neighbors: Number of neighbors to compute. Defaults to 10.
43
41
  """
44
- self.df = df.dropna(subset=features).copy()
45
42
  self.id_column = id_column
46
- self.n_neighbors = min(n_neighbors, len(self.df) - 1)
47
43
  self.target = target
48
- self.features = features
44
+ self.track_columns = track_columns or []
45
+ self.proximity_type = None
49
46
  self.scaler = None
50
47
  self.X = None
51
48
  self.nn = None
52
- self.proximity_type = None
53
- self.track_columns = track_columns or []
54
49
 
55
- # Right now we only support numeric features, so remove any columns that are not numeric
56
- non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
57
- if non_numeric_features:
58
- log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
59
- self.features = [f for f in self.features if f not in non_numeric_features]
50
+ # Filter out non-numeric features
51
+ self.features = self._validate_features(df, features)
52
+
53
+ # Drop NaN rows and set up DataFrame
54
+ self.df = df.dropna(subset=self.features).copy()
55
+ self.n_neighbors = min(n_neighbors, len(self.df) - 1)
60
56
 
61
57
  # Build the proximity model
62
58
  self.build_proximity_model()
63
59
 
60
+ def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
61
+ """Remove non-numeric features and log warnings."""
62
+ non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
63
+ if non_numeric:
64
+ log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
65
+ return [f for f in features if f not in non_numeric]
66
+ return features
67
+
64
68
  def build_proximity_model(self) -> None:
65
- """Standardize features and fit Nearest Neighbors model.
66
- Note: This method can be overridden in subclasses for custom behavior."""
69
+ """Standardize features and fit Nearest Neighbors model."""
67
70
  self.proximity_type = ProximityType.DISTANCE
68
71
  self.scaler = StandardScaler()
69
72
  self.X = self.scaler.fit_transform(self.df[self.features])
@@ -74,27 +77,60 @@ class Proximity:
74
77
  Compute nearest neighbors for all rows in the dataset.
75
78
 
76
79
  Returns:
77
- pd.DataFrame: A DataFrame of neighbors and their distances.
80
+ DataFrame of neighbors and their distances.
78
81
  """
79
82
  distances, indices = self.nn.kneighbors(self.X)
80
- results = []
81
83
 
82
- for i, (dists, nbrs) in enumerate(zip(distances, indices)):
83
- query_id = self.df.iloc[i][self.id_column]
84
-
85
- # Process neighbors
86
- for neighbor_idx, dist in zip(nbrs, dists):
87
- # Skip self (neighbor index == current row index)
88
- if neighbor_idx == i:
89
- continue
90
- results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
84
+ results = [
85
+ self._build_neighbor_result(
86
+ query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
87
+ )
88
+ for i, (dists, nbrs) in enumerate(zip(distances, indices))
89
+ for neighbor_idx, dist in zip(nbrs, dists)
90
+ if neighbor_idx != i # Skip self
91
+ ]
91
92
 
92
93
  return pd.DataFrame(results)
93
94
 
94
95
  def neighbors(
96
+ self,
97
+ id_or_ids,
98
+ n_neighbors: Optional[int] = 5,
99
+ radius: Optional[float] = None,
100
+ include_self: bool = True,
101
+ ) -> pd.DataFrame:
102
+ """
103
+ Return neighbors for ID(s) from the existing dataset.
104
+
105
+ Args:
106
+ id_or_ids: Single ID or list of IDs to look up
107
+ n_neighbors: Number of neighbors to return (default: 5)
108
+ radius: If provided, find all neighbors within this radius
109
+ include_self: Whether to include self in results (if present)
110
+
111
+ Returns:
112
+ DataFrame containing neighbors and distances
113
+ """
114
+ # Normalize to list
115
+ ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
116
+
117
+ # Validate IDs exist
118
+ missing_ids = set(ids) - set(self.df[self.id_column])
119
+ if missing_ids:
120
+ raise ValueError(f"IDs not found in dataset: {missing_ids}")
121
+
122
+ # Filter to requested IDs and preserve order
123
+ query_df = self.df[self.df[self.id_column].isin(ids)]
124
+ query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
125
+
126
+ # Use the core implementation
127
+ return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
128
+
129
+ def find_neighbors(
95
130
  self,
96
131
  query_df: pd.DataFrame,
97
- radius: float = None,
132
+ n_neighbors: Optional[int] = 5,
133
+ radius: Optional[float] = None,
98
134
  include_self: bool = True,
99
135
  ) -> pd.DataFrame:
100
136
  """
@@ -102,63 +138,63 @@ class Proximity:
102
138
 
103
139
  Args:
104
140
  query_df: DataFrame containing query points
141
+ n_neighbors: Number of neighbors to return (default: 5)
105
142
  radius: If provided, find all neighbors within this radius
106
143
  include_self: Whether to include self in results (if present)
107
144
 
108
145
  Returns:
109
146
  DataFrame containing neighbors and distances
110
-
111
- Note: The query DataFrame must include the feature columns. The id_column is optional.
112
147
  """
113
- # Check if all required features are present
148
+ # Validate features
114
149
  missing = set(self.features) - set(query_df.columns)
115
150
  if missing:
116
151
  raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")
117
152
 
118
- # Check if id_column is present
119
153
  id_column_present = self.id_column in query_df.columns
120
154
 
121
- # None of the features can be NaNs, so report rows with NaNs and then drop them
122
- rows_with_nan = query_df[self.features].isna().any(axis=1)
123
-
124
- # Print the ID column for rows with NaNs
125
- if rows_with_nan.any():
126
- log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
127
- log.warning(query_df.loc[rows_with_nan, self.id_column])
128
-
129
- # Drop rows with NaNs in feature columns and reassign to query_df
130
- query_df = query_df.dropna(subset=self.features)
155
+ # Handle NaN rows
156
+ query_df = self._handle_nan_rows(query_df, id_column_present)
131
157
 
132
- # Transform the query features using the model's scaler
158
+ # Transform query features
133
159
  X_query = self.scaler.transform(query_df[self.features])
134
160
 
135
- # Get neighbors using either radius or k-nearest neighbors
161
+ # Get neighbors
136
162
  if radius is not None:
137
163
  distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
138
164
  else:
139
- distances, indices = self.nn.kneighbors(X_query)
165
+ distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)
140
166
 
141
167
  # Build results
142
- all_results = []
168
+ results = []
143
169
  for i, (dists, nbrs) in enumerate(zip(distances, indices)):
144
- # Use the ID from the query DataFrame if available, otherwise use the row index
145
170
  query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"
146
171
 
147
172
  for neighbor_idx, dist in zip(nbrs, dists):
148
- # Skip if the neighbor is the query itself and include_self is False
149
173
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
174
+
175
+ # Skip if neighbor is self and include_self is False
150
176
  if not include_self and neighbor_id == query_id:
151
177
  continue
152
178
 
153
- all_results.append(
154
- self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
155
- )
179
+ results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
180
+
181
+ results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
182
+ return results_df
183
+
184
+ def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
185
+ """Drop rows with NaN values in feature columns and log warnings."""
186
+ rows_with_nan = query_df[self.features].isna().any(axis=1)
187
+
188
+ if rows_with_nan.any():
189
+ log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
190
+ if id_column_present:
191
+ log.warning(query_df.loc[rows_with_nan, self.id_column])
156
192
 
157
- return pd.DataFrame(all_results)
193
+ return query_df.dropna(subset=self.features)
158
194
 
159
195
  def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
160
196
  """
161
- Internal: Build a result dictionary for a single neighbor.
197
+ Build a result dictionary for a single neighbor.
162
198
 
163
199
  Args:
164
200
  query_id: ID of the query point
@@ -169,27 +205,30 @@ class Proximity:
169
205
  Dictionary containing neighbor information
170
206
  """
171
207
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
208
+ neighbor_row = self.df.iloc[neighbor_idx]
172
209
 
173
- # Basic neighbor info
174
- neighbor_info = {
210
+ # Start with basic info
211
+ result = {
175
212
  self.id_column: query_id,
176
213
  "neighbor_id": neighbor_id,
177
214
  "distance": distance,
178
215
  }
179
216
 
180
- # Determine which additional columns to include
181
- relevant_cols = [self.target, "prediction"] if self.target else []
182
- relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
183
- relevant_cols += ["outlier"]
217
+ # Columns to automatically include if they exist
218
+ auto_include = (
219
+ ([self.target, "prediction"] if self.target else [])
220
+ + self.track_columns
221
+ + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
222
+ )
184
223
 
185
- # Add user-specified columns
186
- relevant_cols += self.track_columns
224
+ # Add values for existing columns
225
+ for col in auto_include:
226
+ if col in self.df.columns:
227
+ result[col] = neighbor_row[col]
187
228
 
188
- # Add values for each relevant column that exists in the dataframe
189
- for col in filter(lambda c: c in self.df.columns, relevant_cols):
190
- neighbor_info[col] = self.df.iloc[neighbor_idx][col]
191
-
192
- return neighbor_info
229
+ # Truncate very small distances to zero
230
+ result["distance"] = 0.0 if distance < 1e-7 else distance
231
+ return result
193
232
 
194
233
  def serialize(self, directory: str) -> None:
195
234
  """
@@ -198,8 +237,8 @@ class Proximity:
198
237
  Args:
199
238
  directory: Directory path to save the model components
200
239
  """
201
- # Create directory if it doesn't exist
202
- os.makedirs(directory, exist_ok=True)
240
+ dir_path = Path(directory)
241
+ dir_path.mkdir(parents=True, exist_ok=True)
203
242
 
204
243
  # Save metadata
205
244
  metadata = {
@@ -210,17 +249,16 @@ class Proximity:
210
249
  "n_neighbors": self.n_neighbors,
211
250
  }
212
251
 
213
- with open(os.path.join(directory, "metadata.json"), "w") as f:
214
- json.dump(metadata, f)
252
+ (dir_path / "metadata.json").write_text(json.dumps(metadata))
215
253
 
216
- # Save the DataFrame
217
- self.df.to_pickle(os.path.join(directory, "df.pkl"))
254
+ # Save DataFrame
255
+ self.df.to_pickle(dir_path / "df.pkl")
218
256
 
219
- # Save the scaler and nearest neighbors model
220
- with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
257
+ # Save models
258
+ with open(dir_path / "scaler.pkl", "wb") as f:
221
259
  pickle.dump(self.scaler, f)
222
260
 
223
- with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
261
+ with open(dir_path / "nn_model.pkl", "wb") as f:
224
262
  pickle.dump(self.nn, f)
225
263
 
226
264
  log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@ class Proximity:
234
272
  directory: Directory path containing the serialized model components
235
273
 
236
274
  Returns:
237
- Proximity: A new Proximity instance
275
+ A new Proximity instance
238
276
  """
239
- directory_path = Path(directory)
240
- if not directory_path.exists() or not directory_path.is_dir():
277
+ dir_path = Path(directory)
278
+ if not dir_path.is_dir():
241
279
  raise ValueError(f"Directory {directory} does not exist or is not a directory")
242
280
 
243
281
  # Load metadata
244
- with open(os.path.join(directory, "metadata.json"), "r") as f:
245
- metadata = json.load(f)
282
+ metadata = json.loads((dir_path / "metadata.json").read_text())
246
283
 
247
284
  # Load DataFrame
248
- df_path = os.path.join(directory, "df.pkl")
249
- if not os.path.exists(df_path):
285
+ df_path = dir_path / "df.pkl"
286
+ if not df_path.exists():
250
287
  raise FileNotFoundError(f"DataFrame file not found at {df_path}")
251
288
  df = pd.read_pickle(df_path)
252
289
 
253
- # Create instance but skip _prepare_data
290
+ # Create instance without calling __init__
254
291
  instance = cls.__new__(cls)
255
292
  instance.df = df
256
293
  instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@ class Proximity:
259
296
  instance.track_columns = metadata["track_columns"]
260
297
  instance.n_neighbors = metadata["n_neighbors"]
261
298
 
262
- # Load scaler and nn model
263
- with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
299
+ # Load models
300
+ with open(dir_path / "scaler.pkl", "rb") as f:
264
301
  instance.scaler = pickle.load(f)
265
302
 
266
- with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
303
+ with open(dir_path / "nn_model.pkl", "rb") as f:
267
304
  instance.nn = pickle.load(f)
268
305
 
269
- # Load X from scaler transform
306
+ # Restore X
270
307
  instance.X = instance.scaler.transform(instance.df[instance.features])
308
+ instance.proximity_type = ProximityType.DISTANCE
271
309
 
272
310
  log.info(f"Proximity model deserialized from {directory}")
273
311
  return instance
@@ -294,10 +332,10 @@ if __name__ == "__main__":
294
332
  print(prox.all_neighbors())
295
333
 
296
334
  # Test the neighbors method
297
- print(prox.neighbors(query_df=df.iloc[[0]]))
335
+ print(prox.neighbors(1))
298
336
 
299
337
  # Test the neighbors method with radius
300
- print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
338
+ print(prox.neighbors(1, radius=2.0))
301
339
 
302
340
  # Test with data that isn't in the 'train' dataframe
303
341
  query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
307
345
  "Feature3": [2.31],
308
346
  }
309
347
  query_df = pd.DataFrame(query_data)
310
- print(prox.neighbors(query_df=query_df))
348
+ print(prox.find_neighbors(query_df=query_df)) # For new data we use find_neighbors()
311
349
 
312
350
  # Test with Features list
313
351
  prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
334
372
  print(prox.all_neighbors())
335
373
 
336
374
  # Test the neighbors method
337
- print(prox.neighbors(query_df=df.iloc[0:2]))
375
+ print(prox.neighbors(["a", "b"]))
338
376
 
339
377
  # Time neighbors with all IDs versus calling all_neighbors
340
378
  import time
341
379
 
342
380
  start_time = time.time()
343
- prox_df = prox.neighbors(query_df=df, include_self=False)
381
+ prox_df = prox.find_neighbors(query_df=df, include_self=False)
344
382
  end_time = time.time()
345
383
  print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
346
384
  start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":
361
399
 
362
400
  # Test querying without the id_column
363
401
  df_no_id = df.drop(columns=["foo_id"])
364
- print(prox.neighbors(query_df=df_no_id, include_self=False))
402
+ print(prox.find_neighbors(query_df=df_no_id, include_self=False))
365
403
 
366
404
  # Test duplicate IDs
367
405
  data = {
@@ -379,6 +417,9 @@ if __name__ == "__main__":
379
417
 
380
418
  fs = FeatureSet("abalone_features")
381
419
  model = Model("abalone-regression")
420
+ features = model.features()
382
421
  df = fs.pull_dataframe()
383
- prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
384
- print(prox.neighbors(query_df=df[0:2]))
422
+ prox = Proximity(
423
+ df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
424
+ )
425
+ print(prox.find_neighbors(query_df=df[0:2]))
@@ -19,7 +19,7 @@ from typing import List, Tuple
19
19
  # Template Placeholders
20
20
  TEMPLATE_PARAMS = {
21
21
  "target": "udm_asy_res_free_percent",
22
- "features": ['vsa_estate6', 'naromatom', 'mollogp', 'fr_nh2', 'mp', 'c2sp2', 'xch_3d', 'axp_6d', 'bcut2d_mrhi', 'fr_benzene', 'mz', 'slogp_vsa6', 'fr_halogen', 'bcut2d_mwhi', 'vsa_estate4', 'slogp_vsa3', 'estate_vsa5', 'minestateindex', 'axp_3dv', 'estate_vsa3', 'vsa_estate9', 'molwt', 'hallkieralpha', 'fpdensitymorgan1', 'peoe_vsa13', 'xpc_5d', 'bcut2d_chghi', 'peoe_vsa8', 'axp_0dv', 'axp_2d', 'chi2v', 'bcut2d_logphi', 'axp_5d', 'peoe_vsa2', 'estate_vsa6', 'qed', 'numrotatablebonds', 'xc_3dv', 'peoe_vsa3', 'balabanj', 'slogp_vsa5', 'mv', 'vsa_estate2', 'bcut2d_mwlow', 'xch_7d', 'chi3n', 'vsa_estate8', 'estate_vsa4', 'xp_7dv', 'fr_nh1', 'vsa_estate3', 'fr_ketone_topliss', 'minpartialcharge', 'phi', 'peoe_vsa10', 'vsa_estate7', 'estate_vsa7', 'tpsa', 'kappa3', 'kappa2', 'bcut2d_logplow', 'xch_6d', 'maxpartialcharge', 'vsa_estate1', 'peoe_vsa9', 'axp_1d', 'fr_ar_n', 'chi2n', 'vsa_estate5', 'xp_4dv', 'slogp_vsa10', 'num_stereobonds', 'peoe_vsa11', 'bcut2d_chglo', 'chi1v', 'peoe_vsa7', 'bertzct', 'axp_2dv', 'estate_vsa2', 'smr_vsa9', 'peoe_vsa6', 'num_s_centers', 'num_r_centers', 'xch_7dv', 'xc_5d', 'axp_4dv', 'xc_5dv', 'mi', 'xc_3d', 'fpdensitymorgan2', 'xp_0dv', 'nhohcount', 'numatomstereocenters', 'mse', 'smr_vsa3', 'peoe_vsa12', 'nocount', 'fpdensitymorgan3', 'minabsestateindex', 'bcut2d_mrlow', 'axp_5dv', 'sz', 'vsa_estate10', 'axp_3d', 'xch_6dv', 'xch_4d', 'xc_6d', 'estate_vsa8', 'mpe', 'smr_vsa7', 'numhdonors', 'smr_vsa1', 'xp_5d', 'fr_para_hydroxylation', 'chi3v', 'xpc_6dv', 'nbase', 'heavyatommolwt', 'avgipc', 'maxestateindex', 'smr_vsa6', 'fr_bicyclic', 'xc_4dv', 'xp_7d', 'smr_vsa5', 'xpc_4d', 'smr_vsa4', 'peoe_vsa4', 'numheteroatoms', 'fr_nhpyrrole', 'axp_4d', 'smr_vsa10', 'xp_6d', 'sps', 'mare', 'slogp_vsa2', 'axp_0d', 'slogp_vsa4', 'fr_al_oh', 'numheterocycles', 'labuteasa', 'xp_3d', 'chi4n', 'fractioncsp3', 'maxabspartialcharge', 'fr_al_oh_notert', 'peoe_vsa1', 'axp_7dv', 'slogp_vsa11', 'peoe_vsa5', 'xpc_5dv', 'xpc_6d', 'xp_2d', 'xp_3dv', 'fr_ndealkylation1', 'axp_7d', 'estate_vsa9', 'molmr', 'num_stereocenters', 'si', 'estate_vsa1', 'xc_6dv', 'chi0v', 'fr_oxazole', 'axp_6dv', 'xp_6dv', 'xp_4d', 'numaliphaticheterocycles', 'fr_imine', 'fr_imidazole', 'xp_5dv', 'fr_piperdine', 'slogp_vsa7', 'chi1', 'c1sp2', 'numaromaticheterocycles', 'xpc_4dv', 'c3sp2', 'fr_aniline', 'fr_piperzine', 'axp_1dv', 'xch_4dv', 'chi4v', 'chi1n', 'minabspartialcharge', 'slogp_vsa1', 'fr_nh0', 'chi0n', 'c2sp3', 'xc_4d', 'xch_5dv', 'peoe_vsa14', 'xch_5d', 'numsaturatedrings', 'fr_pyridine', 'kappa1', 'slogp_vsa8', 'xp_2dv', 'fr_ar_coo', 'numvalenceelectrons'],
22
+ "features": ['naromatom', 'fr_nh2', 'mollogp', 'numheterocycles', 'bcut2d_mrhi', 'numaromaticrings', 'smr_vsa7', 'peoe_vsa4', 'slogp_vsa6', 'peoe_vsa8', 'vsa_estate3', 'maxabspartialcharge', 'fr_arn', 'bcut2d_logplow', 'chi1v', 'axp_6d', 'bcut2d_chglo', 'balabanj', 'slogp_vsa10', 'hallkieralpha', 'vsa_estate6', 'fpdensitymorgan1', 'sps', 'qed', 'peoe_vsa7', 'maxestateindex', 'estate_vsa8', 'vsa_estate9', 'fr_nhpyrrole', 'mz', 'mp', 'bcut2d_mwhi', 'peoe_vsa13', 'c2sp2', 'numrotatablebonds', 'kappa3', 'peoe_vsa1', 'slogp_vsa2', 'xc_5dv', 'bertzct', 'estate_vsa10', 'axp_0d', 'estate_vsa2', 'xc_4d', 'smr_vsa1', 'phi', 'estate_vsa3', 'vsa_estate2', 'mv', 'estate_vsa4', 'mm', 'fr_nh1', 'slogp_vsa7', 'chi4n', 'estate_vsa6', 'fpdensitymorgan2', 'molmr', 'mse', 'bcut2d_mwlow', 'bcut2d_mrlow', 'chi2v', 'minestateindex', 'xpc_4dv', 'fr_nh0', 'axp_2d', 'vsa_estate8', 'nhohcount', 'smr_vsa6', 'peoe_vsa9', 'smr_vsa5', 'num_r_centers', 'xpc_6dv', 'xc_3d', 'slogp_vsa5', 'axp_7dv', 'minabsestateindex', 'xc_5d', 'vsa_estate10', 'fr_hoccn', 'smr_vsa3', 'vsa_estate1', 'axp_5d', 'num_s_centers', 'axp_1d', 'estate_vsa1', 'fpdensitymorgan3', 'axp_5dv', 'chi3n', 'peoe_vsa6', 'labuteasa', 'chi2n', 'xc_6d', 'xp_7d', 'tpsa', 'xpc_4d', 'avgipc', 'xp_5d', 'vsa_estate5', 'xch_7d', 'xch_5d', 'axp_4dv', 'nbase', 'xc_3dv', 'kappa2', 'axp_3d', 'c1sp3', 'numhacceptors', 'bcut2d_logphi', 'smr_vsa10', 'fr_piperzine', 'peoe_vsa11', 'axp_6dv', 'peoe_vsa10', 'estate_vsa9', 'bcut2d_chghi', 'xp_6d', 'xch_6dv', 'chi0', 'vsa_estate7', 'mi', 'xpc_5d', 'fractioncsp3', 'xp_0dv', 'kappa1', 'minpartialcharge', 'xp_6dv', 'peoe_vsa2', 'chi3v', 'axp_0dv', 'mare', 'xch_5dv', 'vsa_estate4', 'xp_4dv', 'estate_vsa7', 'xp_3d', 'numaliphaticheterocycles', 'chi1', 'xp_3dv', 'fr_ether', 'xch_6d', 'peoe_vsa12', 'xch_7dv', 'axp_1dv', 'axp_7d', 'fr_ndealkylation2', 'smr_vsa9', 'axp_2dv', 'estate_vsa5', 'mpe', 'molwt', 'xch_4d', 'axp_3dv', 'xp_5dv', 'chi4v', 'heavyatommolwt', 'fr_al_oh', 'xpc_5dv', 'xpc_6d', 'maxpartialcharge', 'numatomstereocenters', 'peoe_vsa3', 'fr_aniline', 'minabspartialcharge', 'c3sp3', 'slogp_vsa1', 'exactmolwt', 'chi1n', 'xp_7dv', 'chi0n', 'xp_2d', 'xch_4dv', 'fr_bicyclic', 'xc_4dv', 'axp_4d', 'slogp_vsa4', 'fr_benzene', 'numaromaticheterocycles', 'fr_aryl_methyl', 'fr_pyridine', 'fr_imine', 'chi0v', 'slogp_vsa12'],
23
23
  "compressed_features": [],
24
24
  "train_all_data": True,
25
25
  "hyperparameters": {},
@@ -93,6 +93,33 @@ def get_custom_script_path(package: str, script_name: str) -> Path:
93
93
  return script_path
94
94
 
95
95
 
96
+ def proximity_model_local(model: "Model", filtered: bool = True):
97
+ """Create a Proximity Model for this Model
98
+
99
+ Args:
100
+ model (Model): The model to create the proximity model from
101
+ filtered (bool, optional): Use filtered training data for the Proximity Model (default: True)
102
+
103
+ Returns:
104
+ Proximity: The proximity model
105
+ """
106
+ from workbench.algorithms.dataframe.proximity import Proximity # noqa: F401 (avoid circular import)
107
+ from workbench.api import Model, FeatureSet # noqa: F401 (avoid circular import)
108
+
109
+ # Get Feature and Target Columns from the existing given Model
110
+ features = model.features()
111
+ target = model.target()
112
+
113
+ # Create the Proximity Model from our FeatureSet
114
+ fs = FeatureSet(model.get_input())
115
+ if filtered:
116
+ df = fs.view("training").pull_dataframe()
117
+ else:
118
+ df = fs.pull_dataframe()
119
+ id_column = fs.id_column
120
+ return Proximity(df, id_column, features, target, track_columns=features)
121
+
122
+
96
123
  def proximity_model(model: "Model", prox_model_name: str, track_columns: list = None) -> "Model":
97
124
  """Create a proximity model based on the given model
98
125
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: workbench
3
- Version: 0.8.183
3
+ Version: 0.8.184
4
4
  Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
5
5
  Author-email: SuperCowPowers LLC <support@supercowpowers.com>
6
6
  License-Expression: MIT
@@ -6,7 +6,7 @@ workbench/algorithms/dataframe/data_source_eda.py,sha256=WgVL6tzBCw1tznQr8RQ6daQ
6
6
  workbench/algorithms/dataframe/feature_space_proximity.py,sha256=6RxzvbpLdDkHMm1D49Nv59SFcyYUj8bisd6_5EpBEGI,3515
7
7
  workbench/algorithms/dataframe/fingerprint_proximity.py,sha256=zqvk2WfY_nRFGC5bYcBbooGfSZsAhZBS8HNQ0GSMpwg,5838
8
8
  workbench/algorithms/dataframe/projection_2d.py,sha256=zK4hc0OQrySmfcfFg8y0GxEL34uDNqvZL4OgttB9vRs,7834
9
- workbench/algorithms/dataframe/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
9
+ workbench/algorithms/dataframe/proximity.py,sha256=P8f3GHRhuc4QHj5KkKW0JMrHhIo2QdBiFG-JituTV1U,14633
10
10
  workbench/algorithms/dataframe/storage/aggregation.py,sha256=VuTb7A6Vh6IS5djZeItvOLnnEOlf7tzMQ8OaYIuftvU,2852
11
11
  workbench/algorithms/dataframe/storage/feature_resolution.py,sha256=w_iLf8EFTg7Jc5laH-bsq8MEtZVqcg05W-GihCqR-r4,9450
12
12
  workbench/algorithms/dataframe/storage/feature_spider.py,sha256=uIZ4JHIKuhpy08wBFReSrohb5DGxx8vGroHUbjPm1jE,14353
@@ -35,7 +35,7 @@ workbench/api/endpoint.py,sha256=RDQs78OFRSp9GYGHHph0gLtAfF4p3We96bzdJ4-9bzw,392
35
35
  workbench/api/feature_set.py,sha256=Yxei3tvWR4gSLcdJnNndux07dNeKNu1HKgsChJtHxEM,6633
36
36
  workbench/api/graph_store.py,sha256=LremJyPrQFgsHb7hxsctuCsoxx3p7TKtaY5qALHe6pc,4372
37
37
  workbench/api/meta.py,sha256=1_9989cPvf3hd3tA-83hLijOGNnhwXAF8aZF45adeDQ,8596
38
- workbench/api/model.py,sha256=RkFVXnlLcMlzNKRUFr_GCmZ7IQJMyhB2lwMwd22HBBo,4691
38
+ workbench/api/model.py,sha256=coIl2QwbfyeVBM_Y2SaU587ioBh2TTDC9r9bOuYme9U,4605
39
39
  workbench/api/monitor.py,sha256=Cez89Uac7Tzt47FxkjoX-YDGccEhvBcxw3sZFtw4ud8,4506
40
40
  workbench/api/parameter_store.py,sha256=7BObkuATuP6C5AG_46kCWsmuCwuh1vgMJDBSN0gTkwM,4294
41
41
  workbench/api/pipeline.py,sha256=MSYGrDSXrRB_oQELtAlOwBfxSBTw3REAkHy5XBHau0Y,6261
@@ -56,7 +56,7 @@ workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99v
56
56
  workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
57
57
  workbench/core/artifacts/endpoint_core.py,sha256=b3cNj1UnlHmQdG1C8bmD2jWpD4h-O6F-75fWSm01uGU,51850
58
58
  workbench/core/artifacts/feature_set_core.py,sha256=7b1o_PzxtwaYC-W2zxlkltiO0fYULA8CVGWwHNmqgtI,31457
59
- workbench/core/artifacts/model_core.py,sha256=ECDwQ0qM5qb1yGJ07U70BVdfkrW9m7p9e6YJWib3uR0,50855
59
+ workbench/core/artifacts/model_core.py,sha256=wjoa2GQnzrrTM-E2VgYZHT9Ixebl3LaKbJL0YvEdrJY,51546
60
60
  workbench/core/artifacts/monitor_core.py,sha256=M307yz7tEzOEHgv-LmtVy9jKjSbM98fHW3ckmNYrwlU,27897
61
61
  workbench/core/cloud_platform/cloud_meta.py,sha256=-g4-LTC3D0PXb3VfaXdLR1ERijKuHdffeMK_zhD-koQ,8809
62
62
  workbench/core/cloud_platform/aws/README.md,sha256=QT5IQXoUHbIA0qQ2wO6_2P2lYjYQFVYuezc22mWY4i8,97
@@ -134,7 +134,7 @@ workbench/model_scripts/custom_models/network_security/Readme.md,sha256=Z2gtiu0h
134
134
  workbench/model_scripts/custom_models/proximity/Readme.md,sha256=RlMFAJZgAT2mCgDk-UwR_R0Y_NbCqeI5-8DUsxsbpWQ,289
135
135
  workbench/model_scripts/custom_models/proximity/feature_space_proximity.template,sha256=eOllmqB20BWtTiV53dgpIqXKtgSbPFDW_zf8PvM3oF0,4813
136
136
  workbench/model_scripts/custom_models/proximity/generated_model_script.py,sha256=Zk170ztSM_rNSxgbY6ofb5NaqkEdQdhYg0UZprYqRyk,9056
137
- workbench/model_scripts/custom_models/proximity/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
137
+ workbench/model_scripts/custom_models/proximity/proximity.py,sha256=P8f3GHRhuc4QHj5KkKW0JMrHhIo2QdBiFG-JituTV1U,14633
138
138
  workbench/model_scripts/custom_models/proximity/requirements.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
139
  workbench/model_scripts/custom_models/uq_models/Readme.md,sha256=UVpL-lvtTrLqwBeQFinLhd_uNrEw4JUlggIdUSDrd-w,188
140
140
  workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=ca3CaAk6HVuNv1HnPgABTzRY3oDrRxomjgD4V1ZDwoc,6448
@@ -143,7 +143,7 @@ workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256
143
143
  workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=Y89qD3gJ8wx9klXXDUQNfoLTImVFcdYLfRz-SA8mppE,21461
144
144
  workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=XTfhODRaHlI1jZGo9pSe-TqNsk2_nuSw0xMO2fKzDv8,14011
145
145
  workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=v1rviYTJGJnQRGgAyveXhOQlS-WFCTlc2vdnWq6HIXk,8241
146
- workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
146
+ workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=P8f3GHRhuc4QHj5KkKW0JMrHhIo2QdBiFG-JituTV1U,14633
147
147
  workbench/model_scripts/custom_models/uq_models/requirements.txt,sha256=fw7T7t_YJAXK3T6Ysbesxh_Agx_tv0oYx72cEBTqRDY,98
148
148
  workbench/model_scripts/custom_script_example/custom_model_script.py,sha256=T8aydawgRVAdSlDimoWpXxG2YuWWQkbcjBVjAeSG2_0,6408
149
149
  workbench/model_scripts/custom_script_example/requirements.txt,sha256=jWlGc7HH7vqyukTm38LN4EyDi8jDUPEay4n45z-30uc,104
@@ -156,7 +156,7 @@ workbench/model_scripts/pytorch_model/requirements.txt,sha256=ICS5nW0wix44EJO2tJ
156
156
  workbench/model_scripts/scikit_learn/generated_model_script.py,sha256=c73ZpJBlU5k13Nx-ZDkLXu7da40CYyhwjwwmuPq6uLg,12870
157
157
  workbench/model_scripts/scikit_learn/requirements.txt,sha256=aVvwiJ3LgBUhM_PyFlb2gHXu_kpGPho3ANBzlOkfcvs,107
158
158
  workbench/model_scripts/scikit_learn/scikit_learn.template,sha256=QQvqx-eX9ZTbYmyupq6R6vIQwosmsmY_MRBPaHyfjdk,12586
159
- workbench/model_scripts/uq_models/generated_model_script.py,sha256=Y89qD3gJ8wx9klXXDUQNfoLTImVFcdYLfRz-SA8mppE,21461
159
+ workbench/model_scripts/uq_models/generated_model_script.py,sha256=OS_ufhyLR9IQcyRV2ukO_CfDnjp60UE9kwcAN4RY0Is,21191
160
160
  workbench/model_scripts/uq_models/mapie.template,sha256=8VzoP-Wp3ECVIDqXVkiTS6bwmn3cd3dDZ2WjYPzXTi8,18955
161
161
  workbench/model_scripts/uq_models/requirements.txt,sha256=fw7T7t_YJAXK3T6Ysbesxh_Agx_tv0oYx72cEBTqRDY,98
162
162
  workbench/model_scripts/xgb_model/generated_model_script.py,sha256=Tbn7EMXxZZO8rDdKQ5fYCbpltACsMXNvuusLL9p-U5c,22319
@@ -219,7 +219,7 @@ workbench/utils/lambda_utils.py,sha256=7GhGRPyXn9o-toWb9HBGSnI8-DhK9YRkwhCSk_mNK
219
219
  workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYwoo7ho,6975
220
220
  workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
221
221
  workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
222
- workbench/utils/model_utils.py,sha256=sbsWDaeTagHdrh8Z_A_yfWsiSIEqSl18sq67l9E2xiA,12602
222
+ workbench/utils/model_utils.py,sha256=5z4E_F_uDUhuz5_0ZNqNXvretImF1c49LX158RP6yfI,13588
223
223
  workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
224
224
  workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
225
225
  workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
@@ -287,9 +287,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
287
287
  workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
288
288
  workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
289
289
  workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
290
- workbench-0.8.183.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
291
- workbench-0.8.183.dist-info/METADATA,sha256=-awjVDp5bJexow_nWLET9GLfph_LRFcouLnPIDgeoAA,9210
292
- workbench-0.8.183.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
293
- workbench-0.8.183.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
294
- workbench-0.8.183.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
295
- workbench-0.8.183.dist-info/RECORD,,
290
+ workbench-0.8.184.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
291
+ workbench-0.8.184.dist-info/METADATA,sha256=3B5uP_y9cOctNaxDK6Z9Fwfcwzf7p9f3HyjJ35B-nqY,9210
292
+ workbench-0.8.184.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
293
+ workbench-0.8.184.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
294
+ workbench-0.8.184.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
295
+ workbench-0.8.184.dist-info/RECORD,,