workbench 0.8.183__py3-none-any.whl → 0.8.185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of workbench has been flagged as potentially problematic.

@@ -2,10 +2,9 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
-from typing import List, Dict
+from typing import List, Dict, Optional
 import logging
 import pickle
-import os
 import json
 from pathlib import Path
 from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
 log = logging.getLogger("workbench")


-# Enumerated Proximity Types (distance or similarity)
 class ProximityType(Enum):
     DISTANCE = "distance"
     SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
         df: pd.DataFrame,
         id_column: str,
         features: List[str],
-        target: str = None,
-        track_columns: List[str] = None,
+        target: Optional[str] = None,
+        track_columns: Optional[List[str]] = None,
         n_neighbors: int = 10,
     ):
         """
         Initialize the Proximity class.

         Args:
-            df (pd.DataFrame): DataFrame containing data for neighbor computations.
-            id_column (str): Name of the column used as the identifier.
-            features (List[str]): List of feature column names to be used for neighbor computations.
-            target (str, optional): Name of the target column. Defaults to None.
-            track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
-            n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            track_columns: Additional columns to track in results. Defaults to None.
+            n_neighbors: Number of neighbors to compute. Defaults to 10.
         """
-        self.df = df.dropna(subset=features).copy()
         self.id_column = id_column
-        self.n_neighbors = min(n_neighbors, len(self.df) - 1)
         self.target = target
-        self.features = features
+        self.track_columns = track_columns or []
+        self.proximity_type = None
         self.scaler = None
         self.X = None
         self.nn = None
-        self.proximity_type = None
-        self.track_columns = track_columns or []

-        # Right now we only support numeric features, so remove any columns that are not numeric
-        non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
-        if non_numeric_features:
-            log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
-            self.features = [f for f in self.features if f not in non_numeric_features]
+        # Filter out non-numeric features
+        self.features = self._validate_features(df, features)
+
+        # Drop NaN rows and set up DataFrame
+        self.df = df.dropna(subset=self.features).copy()
+        self.n_neighbors = min(n_neighbors, len(self.df) - 1)

         # Build the proximity model
         self.build_proximity_model()

+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
+            return [f for f in features if f not in non_numeric]
+        return features
+
     def build_proximity_model(self) -> None:
-        """Standardize features and fit Nearest Neighbors model.
-        Note: This method can be overridden in subclasses for custom behavior."""
+        """Standardize features and fit Nearest Neighbors model."""
         self.proximity_type = ProximityType.DISTANCE
         self.scaler = StandardScaler()
         self.X = self.scaler.fit_transform(self.df[self.features])
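
The constructor refactor above validates the feature list before dropping NaN rows, so rows are now only dropped for missing values in features that will actually be used. A minimal sketch of the new behavior, using toy column names (the import path is taken from elsewhere in this diff):

```python
import pandas as pd
from workbench.algorithms.dataframe.proximity import Proximity

df = pd.DataFrame({
    "ID": [1, 2, 3, 4],
    "Feature1": [0.1, 0.2, 0.3, 0.4],
    "Feature2": [1.0, 2.0, 3.0, 4.0],
    "Color": ["red", "blue", "green", "red"],  # non-numeric
})

# "Color" is filtered out by _validate_features() with a log warning;
# only the numeric features participate in the NaN drop and the model build
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2", "Color"], n_neighbors=2)
print(prox.features)  # ['Feature1', 'Feature2']
```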
@@ -74,27 +77,60 @@ class Proximity:
         Compute nearest neighbors for all rows in the dataset.

         Returns:
-            pd.DataFrame: A DataFrame of neighbors and their distances.
+            DataFrame of neighbors and their distances.
         """
         distances, indices = self.nn.kneighbors(self.X)
-        results = []

-        for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            query_id = self.df.iloc[i][self.id_column]
-
-            # Process neighbors
-            for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip self (neighbor index == current row index)
-                if neighbor_idx == i:
-                    continue
-                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+        results = [
+            self._build_neighbor_result(
+                query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+            )
+            for i, (dists, nbrs) in enumerate(zip(distances, indices))
+            for neighbor_idx, dist in zip(nbrs, dists)
+            if neighbor_idx != i  # Skip self
+        ]

         return pd.DataFrame(results)

     def neighbors(
+        self,
+        id_or_ids,
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
+        include_self: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Return neighbors for ID(s) from the existing dataset.
+
+        Args:
+            id_or_ids: Single ID or list of IDs to look up
+            n_neighbors: Number of neighbors to return (default: 5)
+            radius: If provided, find all neighbors within this radius
+            include_self: Whether to include self in results (if present)
+
+        Returns:
+            DataFrame containing neighbors and distances
+        """
+        # Normalize to list
+        ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
+
+        # Validate IDs exist
+        missing_ids = set(ids) - set(self.df[self.id_column])
+        if missing_ids:
+            raise ValueError(f"IDs not found in dataset: {missing_ids}")
+
+        # Filter to requested IDs and preserve order
+        query_df = self.df[self.df[self.id_column].isin(ids)]
+        query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
+
+        # Use the core implementation
+        return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
+
+    def find_neighbors(
         self,
         query_df: pd.DataFrame,
-        radius: float = None,
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
         include_self: bool = True,
     ) -> pd.DataFrame:
         """
@@ -102,63 +138,63 @@ class Proximity:

         Args:
             query_df: DataFrame containing query points
+            n_neighbors: Number of neighbors to return (default: 5)
             radius: If provided, find all neighbors within this radius
             include_self: Whether to include self in results (if present)

         Returns:
             DataFrame containing neighbors and distances
-
-        Note: The query DataFrame must include the feature columns. The id_column is optional.
         """
-        # Check if all required features are present
+        # Validate features
         missing = set(self.features) - set(query_df.columns)
         if missing:
             raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")

-        # Check if id_column is present
         id_column_present = self.id_column in query_df.columns

-        # None of the features can be NaNs, so report rows with NaNs and then drop them
-        rows_with_nan = query_df[self.features].isna().any(axis=1)
-
-        # Print the ID column for rows with NaNs
-        if rows_with_nan.any():
-            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
-            log.warning(query_df.loc[rows_with_nan, self.id_column])
-
-        # Drop rows with NaNs in feature columns and reassign to query_df
-        query_df = query_df.dropna(subset=self.features)
+        # Handle NaN rows
+        query_df = self._handle_nan_rows(query_df, id_column_present)

-        # Transform the query features using the model's scaler
+        # Transform query features
         X_query = self.scaler.transform(query_df[self.features])

-        # Get neighbors using either radius or k-nearest neighbors
+        # Get neighbors
         if radius is not None:
             distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
         else:
-            distances, indices = self.nn.kneighbors(X_query)
+            distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)

         # Build results
-        all_results = []
+        results = []
         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            # Use the ID from the query DataFrame if available, otherwise use the row index
             query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"

             for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip if the neighbor is the query itself and include_self is False
                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+                # Skip if neighbor is self and include_self is False
                 if not include_self and neighbor_id == query_id:
                     continue

-                all_results.append(
-                    self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
-                )
+                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+
+        results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
+        return results_df
+
+    def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
+        """Drop rows with NaN values in feature columns and log warnings."""
+        rows_with_nan = query_df[self.features].isna().any(axis=1)
+
+        if rows_with_nan.any():
+            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
+            if id_column_present:
+                log.warning(query_df.loc[rows_with_nan, self.id_column])

-        return pd.DataFrame(all_results)
+        return query_df.dropna(subset=self.features)

     def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
         """
-        Internal: Build a result dictionary for a single neighbor.
+        Build a result dictionary for a single neighbor.

         Args:
             query_id: ID of the query point
@@ -169,27 +205,30 @@ class Proximity:
             Dictionary containing neighbor information
         """
         neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+        neighbor_row = self.df.iloc[neighbor_idx]

-        # Basic neighbor info
-        neighbor_info = {
+        # Start with basic info
+        result = {
             self.id_column: query_id,
             "neighbor_id": neighbor_id,
             "distance": distance,
         }

-        # Determine which additional columns to include
-        relevant_cols = [self.target, "prediction"] if self.target else []
-        relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
-        relevant_cols += ["outlier"]
+        # Columns to automatically include if they exist
+        auto_include = (
+            ([self.target, "prediction"] if self.target else [])
+            + self.track_columns
+            + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
+        )

-        # Add user-specified columns
-        relevant_cols += self.track_columns
+        # Add values for existing columns
+        for col in auto_include:
+            if col in self.df.columns:
+                result[col] = neighbor_row[col]

-        # Add values for each relevant column that exists in the dataframe
-        for col in filter(lambda c: c in self.df.columns, relevant_cols):
-            neighbor_info[col] = self.df.iloc[neighbor_idx][col]
-
-        return neighbor_info
+        # Truncate very small distances to zero
+        result["distance"] = 0.0 if distance < 1e-7 else distance
+        return result

     def serialize(self, directory: str) -> None:
         """
@@ -198,8 +237,8 @@ class Proximity:
         Args:
             directory: Directory path to save the model components
         """
-        # Create directory if it doesn't exist
-        os.makedirs(directory, exist_ok=True)
+        dir_path = Path(directory)
+        dir_path.mkdir(parents=True, exist_ok=True)

         # Save metadata
         metadata = {
@@ -210,17 +249,16 @@ class Proximity:
             "n_neighbors": self.n_neighbors,
         }

-        with open(os.path.join(directory, "metadata.json"), "w") as f:
-            json.dump(metadata, f)
+        (dir_path / "metadata.json").write_text(json.dumps(metadata))

-        # Save the DataFrame
-        self.df.to_pickle(os.path.join(directory, "df.pkl"))
+        # Save DataFrame
+        self.df.to_pickle(dir_path / "df.pkl")

-        # Save the scaler and nearest neighbors model
-        with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
+        # Save models
+        with open(dir_path / "scaler.pkl", "wb") as f:
             pickle.dump(self.scaler, f)

-        with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
+        with open(dir_path / "nn_model.pkl", "wb") as f:
             pickle.dump(self.nn, f)

         log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@ class Proximity:
             directory: Directory path containing the serialized model components

         Returns:
-            Proximity: A new Proximity instance
+            A new Proximity instance
         """
-        directory_path = Path(directory)
-        if not directory_path.exists() or not directory_path.is_dir():
+        dir_path = Path(directory)
+        if not dir_path.is_dir():
             raise ValueError(f"Directory {directory} does not exist or is not a directory")

         # Load metadata
-        with open(os.path.join(directory, "metadata.json"), "r") as f:
-            metadata = json.load(f)
+        metadata = json.loads((dir_path / "metadata.json").read_text())

         # Load DataFrame
-        df_path = os.path.join(directory, "df.pkl")
-        if not os.path.exists(df_path):
+        df_path = dir_path / "df.pkl"
+        if not df_path.exists():
             raise FileNotFoundError(f"DataFrame file not found at {df_path}")
         df = pd.read_pickle(df_path)

-        # Create instance but skip _prepare_data
+        # Create instance without calling __init__
         instance = cls.__new__(cls)
         instance.df = df
         instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@ class Proximity:
         instance.track_columns = metadata["track_columns"]
         instance.n_neighbors = metadata["n_neighbors"]

-        # Load scaler and nn model
-        with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
+        # Load models
+        with open(dir_path / "scaler.pkl", "rb") as f:
             instance.scaler = pickle.load(f)

-        with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
+        with open(dir_path / "nn_model.pkl", "rb") as f:
             instance.nn = pickle.load(f)

-        # Load X from scaler transform
+        # Restore X
         instance.X = instance.scaler.transform(instance.df[instance.features])
+        instance.proximity_type = ProximityType.DISTANCE

         log.info(f"Proximity model deserialized from {directory}")
         return instance
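
Serialization has moved entirely to `pathlib` (hence the dropped `os` import), and deserialization now restores `proximity_type` explicitly instead of leaving it unset. A round-trip sketch, assuming the classmethod shown above is named `deserialize()` (the def line sits outside this hunk) and using an illustrative directory path:

```python
prox.serialize("/tmp/prox_model")  # writes metadata.json, df.pkl, scaler.pkl, nn_model.pkl
restored = Proximity.deserialize("/tmp/prox_model")
print(restored.neighbors(1))       # X and proximity_type are rebuilt, ready to query
```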
@@ -294,10 +332,10 @@ if __name__ == "__main__":
    print(prox.all_neighbors())

    # Test the neighbors method
-    print(prox.neighbors(query_df=df.iloc[[0]]))
+    print(prox.neighbors(1))

    # Test the neighbors method with radius
-    print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
+    print(prox.neighbors(1, radius=2.0))

    # Test with data that isn't in the 'train' dataframe
    query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
        "Feature3": [2.31],
    }
    query_df = pd.DataFrame(query_data)
-    print(prox.neighbors(query_df=query_df))
+    print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()

    # Test with Features list
    prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
    print(prox.all_neighbors())

    # Test the neighbors method
-    print(prox.neighbors(query_df=df.iloc[0:2]))
+    print(prox.neighbors(["a", "b"]))

    # Time neighbors with all IDs versus calling all_neighbors
    import time

    start_time = time.time()
-    prox_df = prox.neighbors(query_df=df, include_self=False)
+    prox_df = prox.find_neighbors(query_df=df, include_self=False)
    end_time = time.time()
    print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
    start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":

    # Test querying without the id_column
    df_no_id = df.drop(columns=["foo_id"])
-    print(prox.neighbors(query_df=df_no_id, include_self=False))
+    print(prox.find_neighbors(query_df=df_no_id, include_self=False))

    # Test duplicate IDs
    data = {
@@ -379,6 +417,9 @@ if __name__ == "__main__":

    fs = FeatureSet("abalone_features")
    model = Model("abalone-regression")
+    features = model.features()
    df = fs.pull_dataframe()
-    prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
-    print(prox.neighbors(query_df=df[0:2]))
+    prox = Proximity(
+        df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+    )
+    print(prox.find_neighbors(query_df=df[0:2]))
@@ -19,7 +19,7 @@ from typing import List, Tuple
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "target": "udm_asy_res_free_percent",
-    "features": ['vsa_estate6', 'naromatom', 'mollogp', 'fr_nh2', 'mp', 'c2sp2', 'xch_3d', 'axp_6d', 'bcut2d_mrhi', 'fr_benzene', 'mz', 'slogp_vsa6', 'fr_halogen', 'bcut2d_mwhi', 'vsa_estate4', 'slogp_vsa3', 'estate_vsa5', 'minestateindex', 'axp_3dv', 'estate_vsa3', 'vsa_estate9', 'molwt', 'hallkieralpha', 'fpdensitymorgan1', 'peoe_vsa13', 'xpc_5d', 'bcut2d_chghi', 'peoe_vsa8', 'axp_0dv', 'axp_2d', 'chi2v', 'bcut2d_logphi', 'axp_5d', 'peoe_vsa2', 'estate_vsa6', 'qed', 'numrotatablebonds', 'xc_3dv', 'peoe_vsa3', 'balabanj', 'slogp_vsa5', 'mv', 'vsa_estate2', 'bcut2d_mwlow', 'xch_7d', 'chi3n', 'vsa_estate8', 'estate_vsa4', 'xp_7dv', 'fr_nh1', 'vsa_estate3', 'fr_ketone_topliss', 'minpartialcharge', 'phi', 'peoe_vsa10', 'vsa_estate7', 'estate_vsa7', 'tpsa', 'kappa3', 'kappa2', 'bcut2d_logplow', 'xch_6d', 'maxpartialcharge', 'vsa_estate1', 'peoe_vsa9', 'axp_1d', 'fr_ar_n', 'chi2n', 'vsa_estate5', 'xp_4dv', 'slogp_vsa10', 'num_stereobonds', 'peoe_vsa11', 'bcut2d_chglo', 'chi1v', 'peoe_vsa7', 'bertzct', 'axp_2dv', 'estate_vsa2', 'smr_vsa9', 'peoe_vsa6', 'num_s_centers', 'num_r_centers', 'xch_7dv', 'xc_5d', 'axp_4dv', 'xc_5dv', 'mi', 'xc_3d', 'fpdensitymorgan2', 'xp_0dv', 'nhohcount', 'numatomstereocenters', 'mse', 'smr_vsa3', 'peoe_vsa12', 'nocount', 'fpdensitymorgan3', 'minabsestateindex', 'bcut2d_mrlow', 'axp_5dv', 'sz', 'vsa_estate10', 'axp_3d', 'xch_6dv', 'xch_4d', 'xc_6d', 'estate_vsa8', 'mpe', 'smr_vsa7', 'numhdonors', 'smr_vsa1', 'xp_5d', 'fr_para_hydroxylation', 'chi3v', 'xpc_6dv', 'nbase', 'heavyatommolwt', 'avgipc', 'maxestateindex', 'smr_vsa6', 'fr_bicyclic', 'xc_4dv', 'xp_7d', 'smr_vsa5', 'xpc_4d', 'smr_vsa4', 'peoe_vsa4', 'numheteroatoms', 'fr_nhpyrrole', 'axp_4d', 'smr_vsa10', 'xp_6d', 'sps', 'mare', 'slogp_vsa2', 'axp_0d', 'slogp_vsa4', 'fr_al_oh', 'numheterocycles', 'labuteasa', 'xp_3d', 'chi4n', 'fractioncsp3', 'maxabspartialcharge', 'fr_al_oh_notert', 'peoe_vsa1', 'axp_7dv', 'slogp_vsa11', 'peoe_vsa5', 'xpc_5dv', 'xpc_6d', 'xp_2d', 'xp_3dv', 'fr_ndealkylation1', 'axp_7d', 'estate_vsa9', 'molmr', 'num_stereocenters', 'si', 'estate_vsa1', 'xc_6dv', 'chi0v', 'fr_oxazole', 'axp_6dv', 'xp_6dv', 'xp_4d', 'numaliphaticheterocycles', 'fr_imine', 'fr_imidazole', 'xp_5dv', 'fr_piperdine', 'slogp_vsa7', 'chi1', 'c1sp2', 'numaromaticheterocycles', 'xpc_4dv', 'c3sp2', 'fr_aniline', 'fr_piperzine', 'axp_1dv', 'xch_4dv', 'chi4v', 'chi1n', 'minabspartialcharge', 'slogp_vsa1', 'fr_nh0', 'chi0n', 'c2sp3', 'xc_4d', 'xch_5dv', 'peoe_vsa14', 'xch_5d', 'numsaturatedrings', 'fr_pyridine', 'kappa1', 'slogp_vsa8', 'xp_2dv', 'fr_ar_coo', 'numvalenceelectrons'],
+    "features": ['naromatom', 'minabspartialcharge', 'bcut2d_mrhi', 'smr_vsa10', 'vsa_estate2', 'minpartialcharge', 'xpc_5d', 'sps', 'xc_3dv', 'smr_vsa7', 'bcut2d_logplow', 'mollogp', 'vsa_estate1', 'num_s_centers', 'vsa_estate4', 'peoe_vsa13', 'fr_nh2', 'bertzct', 'estate_vsa4', 'vsa_estate9', 'smr_vsa3', 'fr_nh1', 'molwt', 'estate_vsa5', 'slogp_vsa5', 'maxpartialcharge', 'estate_vsa1', 'fr_hoccn', 'xc_5d', 'nbase', 'chi1v', 'peoe_vsa10', 'tpsa', 'vsa_estate3', 'chi2v', 'estate_vsa8', 'numheteroatoms', 'estate_vsa2', 'peoe_vsa1', 'labuteasa', 'axp_4d', 'xch_7dv', 'chi0n', 'num_r_centers', 'vsa_estate8', 'minabsestateindex', 'bcut2d_chglo', 'bcut2d_mwhi', 'fr_nh0', 'chi4n', 'estate_vsa9', 'smr_vsa5', 'peoe_vsa2', 'peoe_vsa7', 'peoe_vsa9', 'kappa3', 'slogp_vsa3', 'fr_arn', 'estate_vsa3', 'avgipc', 'axp_5d', 'xpc_6d', 'c2sp2', 'peoe_vsa5', 'vsa_estate5', 'balabanj', 'maxabspartialcharge', 'fr_aniline', 'fr_piperdine', 'vsa_estate6', 'bcut2d_mwlow', 'numsaturatedheterocycles', 'vsa_estate10', 'smr_vsa1', 'estate_vsa6', 'smr_vsa6', 'fpdensitymorgan1', 'peoe_vsa3', 'peoe_vsa8', 'smr_vsa9', 'slogp_vsa2', 'nocount', 'fpdensitymorgan3', 'axp_6d', 'bcut2d_mrlow', 'bcut2d_logphi', 'axp_4dv', 'fpdensitymorgan2', 'mp', 'xp_5d', 'fr_nhpyrrole', 'mz', 'mv', 'vsa_estate7', 'axp_7dv', 'mi', 'c1sp2', 'xpc_6dv', 'slogp_vsa10', 'xp_7d', 'axp_3dv', 'peoe_vsa4', 'peoe_vsa6', 'axp_2dv', 'xch_5dv', 'qed', 'estate_vsa7', 'numaromaticrings', 'chi1n', 'axp_0d', 'axp_6dv', 'numrotatablebonds', 'hallkieralpha', 'c1sp3', 'xc_4dv', 'kappa2', 'bcut2d_chghi', 'xch_7d', 'axp_0dv', 'slogp_vsa7', 'axp_7d', 'minestateindex', 'axp_2d', 'axp_1d', 'chi0', 'fractioncsp3', 'slogp_vsa6', 'axp_1dv', 'chi2n', 'xp_6dv', 'maxestateindex', 'xpc_4d', 'numaliphaticheterocycles', 'chi1', 'phi', 'chi3n', 'xc_4d', 'xc_3d', 'peoe_vsa12', 'xp_6d', 'chi3v', 'axp_3d', 'axp_5dv', 'fr_benzene', 'slogp_vsa4', 'fr_pyridine', 'fr_aryl_methyl', 'xp_5dv', 'c3sp3', 'xp_7dv', 'slogp_vsa1', 'peoe_vsa11', 'mse', 'xc_5dv', 'xpc_5dv', 'xc_6dv', 'xp_0dv', 'xch_5d', 'c3sp2', 'numatomstereocenters', 'numhacceptors', 'fr_imidazole', 'numsaturatedrings', 'xpc_4dv', 'chi0v', 'numheterocycles', 'xch_6dv', 'estate_vsa10', 'chi4v', 'mare', 'numhdonors', 'xch_6d', 'xp_4d', 'fr_ar_n', 'numunspecifiedatomstereocenters', 'numspiroatoms', 'xch_4dv', 'fr_morpholine', 'fr_methoxy', 'mm', 'fr_piperzine'],
     "compressed_features": [],
     "train_all_data": True,
     "hyperparameters": {},
@@ -93,6 +93,33 @@ def get_custom_script_path(package: str, script_name: str) -> Path:
     return script_path


+def proximity_model_local(model: "Model", filtered: bool = True):
+    """Create a Proximity Model for this Model
+
+    Args:
+        model (Model): The model to create the proximity model from
+        filtered (bool, optional): Use filtered training data for the Proximity Model (default: True)
+
+    Returns:
+        Proximity: The proximity model
+    """
+    from workbench.algorithms.dataframe.proximity import Proximity  # noqa: F401 (avoid circular import)
+    from workbench.api import Model, FeatureSet  # noqa: F401 (avoid circular import)
+
+    # Get Feature and Target Columns from the existing given Model
+    features = model.features()
+    target = model.target()
+
+    # Create the Proximity Model from our FeatureSet
+    fs = FeatureSet(model.get_input())
+    if filtered:
+        df = fs.view("training").pull_dataframe()
+    else:
+        df = fs.pull_dataframe()
+    id_column = fs.id_column
+    return Proximity(df, id_column, features, target, track_columns=features)
+
+
 def proximity_model(model: "Model", prox_model_name: str, track_columns: list = None) -> "Model":
     """Create a proximity model based on the given model

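The new `proximity_model_local()` helper builds a `Proximity` instance straight from a model's backing FeatureSet, with the model's features doubling as `track_columns`, and never deploys anything to SageMaker. A call sketch (the model name is borrowed from the test code elsewhere in this diff):

```python
from workbench.api import Model

model = Model("abalone-regression")
prox = proximity_model_local(model)                       # filtered "training" view
prox_full = proximity_model_local(model, filtered=False)  # full FeatureSet
first_id = prox.df[prox.id_column].iloc[0]
print(prox.neighbors(first_id))
```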
@@ -386,6 +386,106 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[Dict[st
     return metrics_dict, predictions_df


+def leave_one_out_inference(workbench_model: Any) -> pd.DataFrame:
+    """
+    Performs leave-one-out cross-validation (parallelized).
+    For datasets > 1000 rows, first identifies the 1000 worst predictions via 10-fold CV,
+    then performs true leave-one-out on those samples.
+    Each model trains on ALL data except one sample.
+    """
+    from workbench.api import FeatureSet
+    from joblib import Parallel, delayed
+    from tqdm import tqdm
+
+    def train_and_predict_one(model_params, is_classifier, X, y, train_idx, val_idx):
+        """Train on train_idx, predict on val_idx."""
+        model = xgb.XGBClassifier(**model_params) if is_classifier else xgb.XGBRegressor(**model_params)
+        model.fit(X[train_idx], y[train_idx])
+        return model.predict(X[val_idx])[0]
+
+    # Load model and get params
+    model_artifact_uri = workbench_model.model_data_url()
+    loaded_model = xgboost_model_from_s3(model_artifact_uri)
+    if loaded_model is None:
+        log.error("No XGBoost model found in the artifact.")
+        return pd.DataFrame()
+
+    if isinstance(loaded_model, (xgb.XGBClassifier, xgb.XGBRegressor)):
+        is_classifier = isinstance(loaded_model, xgb.XGBClassifier)
+        model_params = loaded_model.get_params()
+    elif isinstance(loaded_model, xgb.Booster):
+        log.warning("Deprecated: Loaded model is a Booster, wrapping in sklearn model.")
+        is_classifier = workbench_model.model_type.value == "classifier"
+        model_params = {"enable_categorical": True}
+    else:
+        log.error(f"Unexpected model type: {type(loaded_model)}")
+        return pd.DataFrame()
+
+    # Load and prepare data
+    fs = FeatureSet(workbench_model.get_input())
+    df = fs.view("training").pull_dataframe()
+    id_col = fs.id_column
+    target_col = workbench_model.target()
+    feature_cols = workbench_model.features()
+
+    # Convert string features to categorical
+    for col in feature_cols:
+        if df[col].dtype in ["object", "string"]:
+            df[col] = df[col].astype("category")
+
+    # Determine which samples to run LOO on
+    if len(df) > 1000:
+        log.important(f"Dataset has {len(df)} rows. Running 10-fold CV to identify top 1000 worst predictions...")
+        _, predictions_df = cross_fold_inference(workbench_model, nfolds=10)
+        predictions_df["residual_abs"] = np.abs(predictions_df[target_col] - predictions_df["prediction"])
+        worst_samples = predictions_df.nlargest(1000, "residual_abs")
+        worst_ids = worst_samples[id_col].values
+        loo_indices = df[df[id_col].isin(worst_ids)].index.values
+        log.important(f"Running leave-one-out CV on 1000 worst samples. Each model trains on {len(df)-1} rows...")
+    else:
+        log.important(f"Running leave-one-out CV on all {len(df)} samples...")
+        loo_indices = df.index.values
+
+    # Prepare full dataset for training
+    X_full = df[feature_cols].values
+    y_full = df[target_col].values
+
+    # Encode target if classifier
+    label_encoder = LabelEncoder() if is_classifier else None
+    if label_encoder:
+        y_full = label_encoder.fit_transform(y_full)
+
+    # Generate LOO splits
+    splits = []
+    for loo_idx in loo_indices:
+        train_idx = np.delete(np.arange(len(X_full)), loo_idx)
+        val_idx = np.array([loo_idx])
+        splits.append((train_idx, val_idx))
+
+    # Parallel execution
+    predictions = Parallel(n_jobs=4)(
+        delayed(train_and_predict_one)(model_params, is_classifier, X_full, y_full, train_idx, val_idx)
+        for train_idx, val_idx in tqdm(splits, desc="LOO CV")
+    )
+
+    # Build results dataframe
+    predictions_array = np.array(predictions)
+    if label_encoder:
+        predictions_array = label_encoder.inverse_transform(predictions_array.astype(int))
+
+    predictions_df = pd.DataFrame(
+        {
+            id_col: df.loc[loo_indices, id_col].values,
+            target_col: df.loc[loo_indices, target_col].values,
+            "prediction": predictions_array,
+        }
+    )
+
+    predictions_df["residual_abs"] = np.abs(predictions_df[target_col] - predictions_df["prediction"])
+
+    return predictions_df
+
+
 if __name__ == "__main__":
     """Exercise the Model Utilities"""
     from workbench.api import Model, FeatureSet
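
Like `cross_fold_inference()`, the new `leave_one_out_inference()` takes a Workbench model and returns a per-row predictions DataFrame, here with a `residual_abs` column for ranking. A call sketch (the model name is borrowed from the test block elsewhere in this diff):

```python
from workbench.api import Model

model = Model("abalone-regression")
loo_df = leave_one_out_inference(model)
print(loo_df.sort_values("residual_abs", ascending=False).head(10))  # worst predictions first
```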
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.183
+Version: 0.8.185
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT