workbench 0.8.183__py3-none-any.whl → 0.8.184__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

@@ -2,10 +2,9 @@ import pandas as pd
2
2
  import numpy as np
3
3
  from sklearn.preprocessing import StandardScaler
4
4
  from sklearn.neighbors import NearestNeighbors
5
- from typing import List, Dict
5
+ from typing import List, Dict, Optional
6
6
  import logging
7
7
  import pickle
8
- import os
9
8
  import json
10
9
  from pathlib import Path
11
10
  from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
14
13
  log = logging.getLogger("workbench")
15
14
 
16
15
 
17
- # ^Enumerated^ Proximity Types (distance or similarity)
18
16
  class ProximityType(Enum):
19
17
  DISTANCE = "distance"
20
18
  SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
26
24
  df: pd.DataFrame,
27
25
  id_column: str,
28
26
  features: List[str],
29
- target: str = None,
30
- track_columns: List[str] = None,
27
+ target: Optional[str] = None,
28
+ track_columns: Optional[List[str]] = None,
31
29
  n_neighbors: int = 10,
32
30
  ):
33
31
  """
34
32
  Initialize the Proximity class.
35
33
 
36
34
  Args:
37
- df (pd.DataFrame): DataFrame containing data for neighbor computations.
38
- id_column (str): Name of the column used as the identifier.
39
- features (List[str]): List of feature column names to be used for neighbor computations.
40
- target (str, optional): Name of the target column. Defaults to None.
41
- track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
42
- n_neighbors (int): Number of neighbors to compute. Defaults to 10.
35
+ df: DataFrame containing data for neighbor computations.
36
+ id_column: Name of the column used as the identifier.
37
+ features: List of feature column names to be used for neighbor computations.
38
+ target: Name of the target column. Defaults to None.
39
+ track_columns: Additional columns to track in results. Defaults to None.
40
+ n_neighbors: Number of neighbors to compute. Defaults to 10.
43
41
  """
44
- self.df = df.dropna(subset=features).copy()
45
42
  self.id_column = id_column
46
- self.n_neighbors = min(n_neighbors, len(self.df) - 1)
47
43
  self.target = target
48
- self.features = features
44
+ self.track_columns = track_columns or []
45
+ self.proximity_type = None
49
46
  self.scaler = None
50
47
  self.X = None
51
48
  self.nn = None
52
- self.proximity_type = None
53
- self.track_columns = track_columns or []
54
49
 
55
- # Right now we only support numeric features, so remove any columns that are not numeric
56
- non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
57
- if non_numeric_features:
58
- log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
59
- self.features = [f for f in self.features if f not in non_numeric_features]
50
+ # Filter out non-numeric features
51
+ self.features = self._validate_features(df, features)
52
+
53
+ # Drop NaN rows and set up DataFrame
54
+ self.df = df.dropna(subset=self.features).copy()
55
+ self.n_neighbors = min(n_neighbors, len(self.df) - 1)
60
56
 
61
57
  # Build the proximity model
62
58
  self.build_proximity_model()
63
59
 
60
+ def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
61
+ """Remove non-numeric features and log warnings."""
62
+ non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
63
+ if non_numeric:
64
+ log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
65
+ return [f for f in features if f not in non_numeric]
66
+ return features
67
+
64
68
  def build_proximity_model(self) -> None:
65
- """Standardize features and fit Nearest Neighbors model.
66
- Note: This method can be overridden in subclasses for custom behavior."""
69
+ """Standardize features and fit Nearest Neighbors model."""
67
70
  self.proximity_type = ProximityType.DISTANCE
68
71
  self.scaler = StandardScaler()
69
72
  self.X = self.scaler.fit_transform(self.df[self.features])
@@ -74,27 +77,60 @@ class Proximity:
74
77
  Compute nearest neighbors for all rows in the dataset.
75
78
 
76
79
  Returns:
77
- pd.DataFrame: A DataFrame of neighbors and their distances.
80
+ DataFrame of neighbors and their distances.
78
81
  """
79
82
  distances, indices = self.nn.kneighbors(self.X)
80
- results = []
81
83
 
82
- for i, (dists, nbrs) in enumerate(zip(distances, indices)):
83
- query_id = self.df.iloc[i][self.id_column]
84
-
85
- # Process neighbors
86
- for neighbor_idx, dist in zip(nbrs, dists):
87
- # Skip self (neighbor index == current row index)
88
- if neighbor_idx == i:
89
- continue
90
- results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
84
+ results = [
85
+ self._build_neighbor_result(
86
+ query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
87
+ )
88
+ for i, (dists, nbrs) in enumerate(zip(distances, indices))
89
+ for neighbor_idx, dist in zip(nbrs, dists)
90
+ if neighbor_idx != i # Skip self
91
+ ]
91
92
 
92
93
  return pd.DataFrame(results)
93
94
 
94
95
  def neighbors(
96
+ self,
97
+ id_or_ids,
98
+ n_neighbors: Optional[int] = 5,
99
+ radius: Optional[float] = None,
100
+ include_self: bool = True,
101
+ ) -> pd.DataFrame:
102
+ """
103
+ Return neighbors for ID(s) from the existing dataset.
104
+
105
+ Args:
106
+ id_or_ids: Single ID or list of IDs to look up
107
+ n_neighbors: Number of neighbors to return (default: 5)
108
+ radius: If provided, find all neighbors within this radius
109
+ include_self: Whether to include self in results (if present)
110
+
111
+ Returns:
112
+ DataFrame containing neighbors and distances
113
+ """
114
+ # Normalize to list
115
+ ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
116
+
117
+ # Validate IDs exist
118
+ missing_ids = set(ids) - set(self.df[self.id_column])
119
+ if missing_ids:
120
+ raise ValueError(f"IDs not found in dataset: {missing_ids}")
121
+
122
+ # Filter to requested IDs and preserve order
123
+ query_df = self.df[self.df[self.id_column].isin(ids)]
124
+ query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
125
+
126
+ # Use the core implementation
127
+ return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
128
+
129
+ def find_neighbors(
95
130
  self,
96
131
  query_df: pd.DataFrame,
97
- radius: float = None,
132
+ n_neighbors: Optional[int] = 5,
133
+ radius: Optional[float] = None,
98
134
  include_self: bool = True,
99
135
  ) -> pd.DataFrame:
100
136
  """
@@ -102,63 +138,63 @@ class Proximity:
102
138
 
103
139
  Args:
104
140
  query_df: DataFrame containing query points
141
+ n_neighbors: Number of neighbors to return (default: 5)
105
142
  radius: If provided, find all neighbors within this radius
106
143
  include_self: Whether to include self in results (if present)
107
144
 
108
145
  Returns:
109
146
  DataFrame containing neighbors and distances
110
-
111
- Note: The query DataFrame must include the feature columns. The id_column is optional.
112
147
  """
113
- # Check if all required features are present
148
+ # Validate features
114
149
  missing = set(self.features) - set(query_df.columns)
115
150
  if missing:
116
151
  raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")
117
152
 
118
- # Check if id_column is present
119
153
  id_column_present = self.id_column in query_df.columns
120
154
 
121
- # None of the features can be NaNs, so report rows with NaNs and then drop them
122
- rows_with_nan = query_df[self.features].isna().any(axis=1)
123
-
124
- # Print the ID column for rows with NaNs
125
- if rows_with_nan.any():
126
- log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
127
- log.warning(query_df.loc[rows_with_nan, self.id_column])
128
-
129
- # Drop rows with NaNs in feature columns and reassign to query_df
130
- query_df = query_df.dropna(subset=self.features)
155
+ # Handle NaN rows
156
+ query_df = self._handle_nan_rows(query_df, id_column_present)
131
157
 
132
- # Transform the query features using the model's scaler
158
+ # Transform query features
133
159
  X_query = self.scaler.transform(query_df[self.features])
134
160
 
135
- # Get neighbors using either radius or k-nearest neighbors
161
+ # Get neighbors
136
162
  if radius is not None:
137
163
  distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
138
164
  else:
139
- distances, indices = self.nn.kneighbors(X_query)
165
+ distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)
140
166
 
141
167
  # Build results
142
- all_results = []
168
+ results = []
143
169
  for i, (dists, nbrs) in enumerate(zip(distances, indices)):
144
- # Use the ID from the query DataFrame if available, otherwise use the row index
145
170
  query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"
146
171
 
147
172
  for neighbor_idx, dist in zip(nbrs, dists):
148
- # Skip if the neighbor is the query itself and include_self is False
149
173
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
174
+
175
+ # Skip if neighbor is self and include_self is False
150
176
  if not include_self and neighbor_id == query_id:
151
177
  continue
152
178
 
153
- all_results.append(
154
- self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
155
- )
179
+ results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
180
+
181
+ results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
182
+ return results_df
183
+
184
+ def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
185
+ """Drop rows with NaN values in feature columns and log warnings."""
186
+ rows_with_nan = query_df[self.features].isna().any(axis=1)
187
+
188
+ if rows_with_nan.any():
189
+ log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
190
+ if id_column_present:
191
+ log.warning(query_df.loc[rows_with_nan, self.id_column])
156
192
 
157
- return pd.DataFrame(all_results)
193
+ return query_df.dropna(subset=self.features)
158
194
 
159
195
  def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
160
196
  """
161
- Internal: Build a result dictionary for a single neighbor.
197
+ Build a result dictionary for a single neighbor.
162
198
 
163
199
  Args:
164
200
  query_id: ID of the query point
@@ -169,27 +205,30 @@ class Proximity:
169
205
  Dictionary containing neighbor information
170
206
  """
171
207
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
208
+ neighbor_row = self.df.iloc[neighbor_idx]
172
209
 
173
- # Basic neighbor info
174
- neighbor_info = {
210
+ # Start with basic info
211
+ result = {
175
212
  self.id_column: query_id,
176
213
  "neighbor_id": neighbor_id,
177
214
  "distance": distance,
178
215
  }
179
216
 
180
- # Determine which additional columns to include
181
- relevant_cols = [self.target, "prediction"] if self.target else []
182
- relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
183
- relevant_cols += ["outlier"]
217
+ # Columns to automatically include if they exist
218
+ auto_include = (
219
+ ([self.target, "prediction"] if self.target else [])
220
+ + self.track_columns
221
+ + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
222
+ )
184
223
 
185
- # Add user-specified columns
186
- relevant_cols += self.track_columns
224
+ # Add values for existing columns
225
+ for col in auto_include:
226
+ if col in self.df.columns:
227
+ result[col] = neighbor_row[col]
187
228
 
188
- # Add values for each relevant column that exists in the dataframe
189
- for col in filter(lambda c: c in self.df.columns, relevant_cols):
190
- neighbor_info[col] = self.df.iloc[neighbor_idx][col]
191
-
192
- return neighbor_info
229
+ # Truncate very small distances to zero
230
+ result["distance"] = 0.0 if distance < 1e-7 else distance
231
+ return result
193
232
 
194
233
  def serialize(self, directory: str) -> None:
195
234
  """
@@ -198,8 +237,8 @@ class Proximity:
198
237
  Args:
199
238
  directory: Directory path to save the model components
200
239
  """
201
- # Create directory if it doesn't exist
202
- os.makedirs(directory, exist_ok=True)
240
+ dir_path = Path(directory)
241
+ dir_path.mkdir(parents=True, exist_ok=True)
203
242
 
204
243
  # Save metadata
205
244
  metadata = {
@@ -210,17 +249,16 @@ class Proximity:
210
249
  "n_neighbors": self.n_neighbors,
211
250
  }
212
251
 
213
- with open(os.path.join(directory, "metadata.json"), "w") as f:
214
- json.dump(metadata, f)
252
+ (dir_path / "metadata.json").write_text(json.dumps(metadata))
215
253
 
216
- # Save the DataFrame
217
- self.df.to_pickle(os.path.join(directory, "df.pkl"))
254
+ # Save DataFrame
255
+ self.df.to_pickle(dir_path / "df.pkl")
218
256
 
219
- # Save the scaler and nearest neighbors model
220
- with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
257
+ # Save models
258
+ with open(dir_path / "scaler.pkl", "wb") as f:
221
259
  pickle.dump(self.scaler, f)
222
260
 
223
- with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
261
+ with open(dir_path / "nn_model.pkl", "wb") as f:
224
262
  pickle.dump(self.nn, f)
225
263
 
226
264
  log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@ class Proximity:
234
272
  directory: Directory path containing the serialized model components
235
273
 
236
274
  Returns:
237
- Proximity: A new Proximity instance
275
+ A new Proximity instance
238
276
  """
239
- directory_path = Path(directory)
240
- if not directory_path.exists() or not directory_path.is_dir():
277
+ dir_path = Path(directory)
278
+ if not dir_path.is_dir():
241
279
  raise ValueError(f"Directory {directory} does not exist or is not a directory")
242
280
 
243
281
  # Load metadata
244
- with open(os.path.join(directory, "metadata.json"), "r") as f:
245
- metadata = json.load(f)
282
+ metadata = json.loads((dir_path / "metadata.json").read_text())
246
283
 
247
284
  # Load DataFrame
248
- df_path = os.path.join(directory, "df.pkl")
249
- if not os.path.exists(df_path):
285
+ df_path = dir_path / "df.pkl"
286
+ if not df_path.exists():
250
287
  raise FileNotFoundError(f"DataFrame file not found at {df_path}")
251
288
  df = pd.read_pickle(df_path)
252
289
 
253
- # Create instance but skip _prepare_data
290
+ # Create instance without calling __init__
254
291
  instance = cls.__new__(cls)
255
292
  instance.df = df
256
293
  instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@ class Proximity:
259
296
  instance.track_columns = metadata["track_columns"]
260
297
  instance.n_neighbors = metadata["n_neighbors"]
261
298
 
262
- # Load scaler and nn model
263
- with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
299
+ # Load models
300
+ with open(dir_path / "scaler.pkl", "rb") as f:
264
301
  instance.scaler = pickle.load(f)
265
302
 
266
- with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
303
+ with open(dir_path / "nn_model.pkl", "rb") as f:
267
304
  instance.nn = pickle.load(f)
268
305
 
269
- # Load X from scaler transform
306
+ # Restore X
270
307
  instance.X = instance.scaler.transform(instance.df[instance.features])
308
+ instance.proximity_type = ProximityType.DISTANCE
271
309
 
272
310
  log.info(f"Proximity model deserialized from {directory}")
273
311
  return instance
@@ -294,10 +332,10 @@ if __name__ == "__main__":
294
332
  print(prox.all_neighbors())
295
333
 
296
334
  # Test the neighbors method
297
- print(prox.neighbors(query_df=df.iloc[[0]]))
335
+ print(prox.neighbors(1))
298
336
 
299
337
  # Test the neighbors method with radius
300
- print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
338
+ print(prox.neighbors(1, radius=2.0))
301
339
 
302
340
  # Test with data that isn't in the 'train' dataframe
303
341
  query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
307
345
  "Feature3": [2.31],
308
346
  }
309
347
  query_df = pd.DataFrame(query_data)
310
- print(prox.neighbors(query_df=query_df))
348
+ print(prox.find_neighbors(query_df=query_df)) # For new data we use find_neighbors()
311
349
 
312
350
  # Test with Features list
313
351
  prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
334
372
  print(prox.all_neighbors())
335
373
 
336
374
  # Test the neighbors method
337
- print(prox.neighbors(query_df=df.iloc[0:2]))
375
+ print(prox.neighbors(["a", "b"]))
338
376
 
339
377
  # Time neighbors with all IDs versus calling all_neighbors
340
378
  import time
341
379
 
342
380
  start_time = time.time()
343
- prox_df = prox.neighbors(query_df=df, include_self=False)
381
+ prox_df = prox.find_neighbors(query_df=df, include_self=False)
344
382
  end_time = time.time()
345
383
  print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
346
384
  start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":
361
399
 
362
400
  # Test querying without the id_column
363
401
  df_no_id = df.drop(columns=["foo_id"])
364
- print(prox.neighbors(query_df=df_no_id, include_self=False))
402
+ print(prox.find_neighbors(query_df=df_no_id, include_self=False))
365
403
 
366
404
  # Test duplicate IDs
367
405
  data = {
@@ -379,6 +417,9 @@ if __name__ == "__main__":
379
417
 
380
418
  fs = FeatureSet("abalone_features")
381
419
  model = Model("abalone-regression")
420
+ features = model.features()
382
421
  df = fs.pull_dataframe()
383
- prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
384
- print(prox.neighbors(query_df=df[0:2]))
422
+ prox = Proximity(
423
+ df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
424
+ )
425
+ print(prox.find_neighbors(query_df=df[0:2]))
workbench/api/model.py CHANGED
@@ -10,7 +10,7 @@ from workbench.core.artifacts.artifact import Artifact
10
10
  from workbench.core.artifacts.model_core import ModelCore, ModelType # noqa: F401
11
11
  from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint
12
12
  from workbench.api.endpoint import Endpoint
13
- from workbench.utils.model_utils import proximity_model, uq_model
13
+ from workbench.utils.model_utils import proximity_model_local, uq_model
14
14
 
15
15
 
16
16
  class Model(ModelCore):
@@ -83,19 +83,16 @@ class Model(ModelCore):
83
83
  end.set_owner(self.get_owner())
84
84
  return end
85
85
 
86
- def prox_model(self, prox_model_name: str = None, track_columns: list = None) -> "Model":
87
- """Create a Proximity Model for this Model
86
+ def prox_model(self, filtered: bool = True):
87
+ """Create a local Proximity Model for this Model
88
88
 
89
89
  Args:
90
- prox_model_name (str, optional): Name of the Proximity Model (if not specified, a name will be generated)
91
- track_columns (list, optional): List of columns to track in the Proximity Model.
90
+ filtered (bool, optional): Use filtered training data for the Proximity Model (default: True)
92
91
 
93
92
  Returns:
94
- Model: The Proximity Model
93
+ Proximity: A local Proximity Model
95
94
  """
96
- if prox_model_name is None:
97
- prox_model_name = self.model_name + "-prox"
98
- return proximity_model(self, prox_model_name, track_columns=track_columns)
95
+ return proximity_model_local(self, filtered=filtered)
99
96
 
100
97
  def uq_model(self, uq_model_name: str = None, train_all_data: bool = False) -> "Model":
101
98
  """Create a Uncertainty Quantification Model for this Model
@@ -121,6 +118,10 @@ if __name__ == "__main__":
121
118
  pprint(my_model.summary())
122
119
  pprint(my_model.details())
123
120
 
124
- # Create an Endpoint from the Model
125
- my_endpoint = my_model.to_endpoint()
126
- pprint(my_endpoint.summary())
121
+ # Create an Endpoint from the Model (commented out for now)
122
+ # my_endpoint = my_model.to_endpoint()
123
+ # pprint(my_endpoint.summary())
124
+
125
+ # Create a local Proximity Model for this Model
126
+ prox_model = my_model.prox_model()
127
+ print(prox_model.neighbors(3398))
@@ -21,6 +21,7 @@ from workbench.utils.aws_utils import newest_path, pull_s3_data
21
21
  from workbench.utils.s3_utils import compute_s3_object_hash
22
22
  from workbench.utils.shap_utils import shap_values_data, shap_feature_importance
23
23
  from workbench.utils.deprecated_utils import deprecated
24
+ from workbench.utils.model_utils import proximity_model
24
25
 
25
26
 
26
27
  class ModelType(Enum):
@@ -881,6 +882,20 @@ class ModelCore(Artifact):
881
882
  except (KeyError, IndexError, TypeError):
882
883
  return None
883
884
 
885
+ def publish_prox_model(self, prox_model_name: str = None, track_columns: list = None):
886
+ """Create and publish a Proximity Model for this Model
887
+
888
+ Args:
889
+ prox_model_name (str, optional): Name of the Proximity Model (if not specified, a name will be generated)
890
+ track_columns (list, optional): List of columns to track in the Proximity Model.
891
+
892
+ Returns:
893
+ Model: The published Proximity Model
894
+ """
895
+ if prox_model_name is None:
896
+ prox_model_name = self.model_name + "-prox"
897
+ return proximity_model(self, prox_model_name, track_columns=track_columns)
898
+
884
899
  def delete(self):
885
900
  """Delete the Model Packages and the Model Group"""
886
901
  if not self.exists():