workbench-0.8.168-py3-none-any.whl → workbench-0.8.193-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +71 -49
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +11 -6
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
  42. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  43. workbench/repl/workbench_shell.py +4 -4
  44. workbench/scripts/lambda_launcher.py +63 -0
  45. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  46. workbench/scripts/ml_pipeline_sqs.py +186 -0
  47. workbench/utils/chem_utils/__init__.py +0 -0
  48. workbench/utils/chem_utils/fingerprints.py +134 -0
  49. workbench/utils/chem_utils/misc.py +194 -0
  50. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  51. workbench/utils/chem_utils/mol_standardize.py +450 -0
  52. workbench/utils/chem_utils/mol_tagging.py +348 -0
  53. workbench/utils/chem_utils/projections.py +209 -0
  54. workbench/utils/chem_utils/salts.py +256 -0
  55. workbench/utils/chem_utils/sdf.py +292 -0
  56. workbench/utils/chem_utils/toxicity.py +250 -0
  57. workbench/utils/chem_utils/vis.py +253 -0
  58. workbench/utils/config_manager.py +2 -6
  59. workbench/utils/endpoint_utils.py +5 -7
  60. workbench/utils/license_manager.py +2 -6
  61. workbench/utils/model_utils.py +89 -31
  62. workbench/utils/monitor_utils.py +44 -62
  63. workbench/utils/pandas_utils.py +3 -3
  64. workbench/utils/shap_utils.py +10 -2
  65. workbench/utils/workbench_sqs.py +1 -1
  66. workbench/utils/xgboost_model_utils.py +300 -151
  67. workbench/web_interface/components/model_plot.py +7 -1
  68. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  69. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  70. workbench/web_interface/components/plugins/model_details.py +7 -2
  71. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  72. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
  73. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
  74. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
  75. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
  76. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  77. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  78. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  79. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  80. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  81. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  82. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  83. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  84. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  85. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  86. workbench/utils/chem_utils.py +0 -1556
  87. workbench/utils/fast_inference.py +0 -167
  88. workbench/utils/resource_utils.py +0 -39
  89. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
  90. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -2,10 +2,9 @@ import pandas as pd
  import numpy as np
  from sklearn.preprocessing import StandardScaler
  from sklearn.neighbors import NearestNeighbors
- from typing import List, Dict
+ from typing import List, Dict, Optional
  import logging
  import pickle
- import os
  import json
  from pathlib import Path
  from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
  log = logging.getLogger("workbench")


- # ^Enumerated^ Proximity Types (distance or similarity)
  class ProximityType(Enum):
  DISTANCE = "distance"
  SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
  df: pd.DataFrame,
  id_column: str,
  features: List[str],
- target: str = None,
- track_columns: List[str] = None,
+ target: Optional[str] = None,
+ track_columns: Optional[List[str]] = None,
  n_neighbors: int = 10,
  ):
  """
  Initialize the Proximity class.

  Args:
- df (pd.DataFrame): DataFrame containing data for neighbor computations.
- id_column (str): Name of the column used as the identifier.
- features (List[str]): List of feature column names to be used for neighbor computations.
- target (str, optional): Name of the target column. Defaults to None.
- track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
- n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+ df: DataFrame containing data for neighbor computations.
+ id_column: Name of the column used as the identifier.
+ features: List of feature column names to be used for neighbor computations.
+ target: Name of the target column. Defaults to None.
+ track_columns: Additional columns to track in results. Defaults to None.
+ n_neighbors: Number of neighbors to compute. Defaults to 10.
  """
- self.df = df.dropna(subset=features).copy()
  self.id_column = id_column
- self.n_neighbors = min(n_neighbors, len(self.df) - 1)
  self.target = target
- self.features = features
+ self.track_columns = track_columns or []
+ self.proximity_type = None
  self.scaler = None
  self.X = None
  self.nn = None
- self.proximity_type = None
- self.track_columns = track_columns or []

- # Right now we only support numeric features, so remove any columns that are not numeric
- non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
- if non_numeric_features:
- log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
- self.features = [f for f in self.features if f not in non_numeric_features]
+ # Filter out non-numeric features
+ self.features = self._validate_features(df, features)
+
+ # Drop NaN rows and set up DataFrame
+ self.df = df.dropna(subset=self.features).copy()
+ self.n_neighbors = min(n_neighbors, len(self.df) - 1)

  # Build the proximity model
  self.build_proximity_model()

+ def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+ """Remove non-numeric features and log warnings."""
+ non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
+ if non_numeric:
+ log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
+ return [f for f in features if f not in non_numeric]
+ return features
+
  def build_proximity_model(self) -> None:
- """Standardize features and fit Nearest Neighbors model.
- Note: This method can be overridden in subclasses for custom behavior."""
+ """Standardize features and fit Nearest Neighbors model."""
  self.proximity_type = ProximityType.DISTANCE
  self.scaler = StandardScaler()
  self.X = self.scaler.fit_transform(self.df[self.features])
@@ -74,27 +77,60 @@ class Proximity:
  Compute nearest neighbors for all rows in the dataset.

  Returns:
- pd.DataFrame: A DataFrame of neighbors and their distances.
+ DataFrame of neighbors and their distances.
  """
  distances, indices = self.nn.kneighbors(self.X)
- results = []

- for i, (dists, nbrs) in enumerate(zip(distances, indices)):
- query_id = self.df.iloc[i][self.id_column]
-
- # Process neighbors
- for neighbor_idx, dist in zip(nbrs, dists):
- # Skip self (neighbor index == current row index)
- if neighbor_idx == i:
- continue
- results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+ results = [
+ self._build_neighbor_result(
+ query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+ )
+ for i, (dists, nbrs) in enumerate(zip(distances, indices))
+ for neighbor_idx, dist in zip(nbrs, dists)
+ if neighbor_idx != i  # Skip self
+ ]

  return pd.DataFrame(results)

  def neighbors(
+ self,
+ id_or_ids,
+ n_neighbors: Optional[int] = 5,
+ radius: Optional[float] = None,
+ include_self: bool = True,
+ ) -> pd.DataFrame:
+ """
+ Return neighbors for ID(s) from the existing dataset.
+
+ Args:
+ id_or_ids: Single ID or list of IDs to look up
+ n_neighbors: Number of neighbors to return (default: 5)
+ radius: If provided, find all neighbors within this radius
+ include_self: Whether to include self in results (if present)
+
+ Returns:
+ DataFrame containing neighbors and distances
+ """
+ # Normalize to list
+ ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
+
+ # Validate IDs exist
+ missing_ids = set(ids) - set(self.df[self.id_column])
+ if missing_ids:
+ raise ValueError(f"IDs not found in dataset: {missing_ids}")
+
+ # Filter to requested IDs and preserve order
+ query_df = self.df[self.df[self.id_column].isin(ids)]
+ query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
+
+ # Use the core implementation
+ return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
+
+ def find_neighbors(
  self,
  query_df: pd.DataFrame,
- radius: float = None,
+ n_neighbors: Optional[int] = 5,
+ radius: Optional[float] = None,
  include_self: bool = True,
  ) -> pd.DataFrame:
  """
@@ -102,63 +138,63 @@ class Proximity:

  Args:
  query_df: DataFrame containing query points
+ n_neighbors: Number of neighbors to return (default: 5)
  radius: If provided, find all neighbors within this radius
  include_self: Whether to include self in results (if present)

  Returns:
  DataFrame containing neighbors and distances
-
- Note: The query DataFrame must include the feature columns. The id_column is optional.
  """
- # Check if all required features are present
+ # Validate features
  missing = set(self.features) - set(query_df.columns)
  if missing:
  raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")

- # Check if id_column is present
  id_column_present = self.id_column in query_df.columns

- # None of the features can be NaNs, so report rows with NaNs and then drop them
- rows_with_nan = query_df[self.features].isna().any(axis=1)
-
- # Print the ID column for rows with NaNs
- if rows_with_nan.any():
- log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
- log.warning(query_df.loc[rows_with_nan, self.id_column])
-
- # Drop rows with NaNs in feature columns and reassign to query_df
- query_df = query_df.dropna(subset=self.features)
+ # Handle NaN rows
+ query_df = self._handle_nan_rows(query_df, id_column_present)

- # Transform the query features using the model's scaler
+ # Transform query features
  X_query = self.scaler.transform(query_df[self.features])

- # Get neighbors using either radius or k-nearest neighbors
+ # Get neighbors
  if radius is not None:
  distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
  else:
- distances, indices = self.nn.kneighbors(X_query)
+ distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)

  # Build results
- all_results = []
+ results = []
  for i, (dists, nbrs) in enumerate(zip(distances, indices)):
- # Use the ID from the query DataFrame if available, otherwise use the row index
  query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"

  for neighbor_idx, dist in zip(nbrs, dists):
- # Skip if the neighbor is the query itself and include_self is False
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+ # Skip if neighbor is self and include_self is False
  if not include_self and neighbor_id == query_id:
  continue

- all_results.append(
- self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
- )
+ results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+
+ results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
+ return results_df
+
+ def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
+ """Drop rows with NaN values in feature columns and log warnings."""
+ rows_with_nan = query_df[self.features].isna().any(axis=1)
+
+ if rows_with_nan.any():
+ log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
+ if id_column_present:
+ log.warning(query_df.loc[rows_with_nan, self.id_column])

- return pd.DataFrame(all_results)
+ return query_df.dropna(subset=self.features)

  def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
  """
- Internal: Build a result dictionary for a single neighbor.
+ Build a result dictionary for a single neighbor.

  Args:
  query_id: ID of the query point
@@ -169,27 +205,30 @@ class Proximity:
  Dictionary containing neighbor information
  """
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+ neighbor_row = self.df.iloc[neighbor_idx]

- # Basic neighbor info
- neighbor_info = {
+ # Start with basic info
+ result = {
  self.id_column: query_id,
  "neighbor_id": neighbor_id,
  "distance": distance,
  }

- # Determine which additional columns to include
- relevant_cols = [self.target, "prediction"] if self.target else []
- relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
- relevant_cols += ["outlier"]
+ # Columns to automatically include if they exist
+ auto_include = (
+ ([self.target, "prediction"] if self.target else [])
+ + self.track_columns
+ + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
+ )

- # Add user-specified columns
- relevant_cols += self.track_columns
+ # Add values for existing columns
+ for col in auto_include:
+ if col in self.df.columns:
+ result[col] = neighbor_row[col]

- # Add values for each relevant column that exists in the dataframe
- for col in filter(lambda c: c in self.df.columns, relevant_cols):
- neighbor_info[col] = self.df.iloc[neighbor_idx][col]
-
- return neighbor_info
+ # Truncate very small distances to zero
+ result["distance"] = 0.0 if distance < 1e-7 else distance
+ return result

  def serialize(self, directory: str) -> None:
  """
@@ -198,8 +237,8 @@ class Proximity:
  Args:
  directory: Directory path to save the model components
  """
- # Create directory if it doesn't exist
- os.makedirs(directory, exist_ok=True)
+ dir_path = Path(directory)
+ dir_path.mkdir(parents=True, exist_ok=True)

  # Save metadata
  metadata = {
@@ -210,17 +249,16 @@
  "n_neighbors": self.n_neighbors,
  }

- with open(os.path.join(directory, "metadata.json"), "w") as f:
- json.dump(metadata, f)
+ (dir_path / "metadata.json").write_text(json.dumps(metadata))

- # Save the DataFrame
- self.df.to_pickle(os.path.join(directory, "df.pkl"))
+ # Save DataFrame
+ self.df.to_pickle(dir_path / "df.pkl")

- # Save the scaler and nearest neighbors model
- with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
+ # Save models
+ with open(dir_path / "scaler.pkl", "wb") as f:
  pickle.dump(self.scaler, f)

- with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
+ with open(dir_path / "nn_model.pkl", "wb") as f:
  pickle.dump(self.nn, f)

  log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@
  directory: Directory path containing the serialized model components

  Returns:
- Proximity: A new Proximity instance
+ A new Proximity instance
  """
- directory_path = Path(directory)
- if not directory_path.exists() or not directory_path.is_dir():
+ dir_path = Path(directory)
+ if not dir_path.is_dir():
  raise ValueError(f"Directory {directory} does not exist or is not a directory")

  # Load metadata
- with open(os.path.join(directory, "metadata.json"), "r") as f:
- metadata = json.load(f)
+ metadata = json.loads((dir_path / "metadata.json").read_text())

  # Load DataFrame
- df_path = os.path.join(directory, "df.pkl")
- if not os.path.exists(df_path):
+ df_path = dir_path / "df.pkl"
+ if not df_path.exists():
  raise FileNotFoundError(f"DataFrame file not found at {df_path}")
  df = pd.read_pickle(df_path)

- # Create instance but skip _prepare_data
+ # Create instance without calling __init__
  instance = cls.__new__(cls)
  instance.df = df
  instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@
  instance.track_columns = metadata["track_columns"]
  instance.n_neighbors = metadata["n_neighbors"]

- # Load scaler and nn model
- with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
+ # Load models
+ with open(dir_path / "scaler.pkl", "rb") as f:
  instance.scaler = pickle.load(f)

- with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
+ with open(dir_path / "nn_model.pkl", "rb") as f:
  instance.nn = pickle.load(f)

- # Load X from scaler transform
+ # Restore X
  instance.X = instance.scaler.transform(instance.df[instance.features])
+ instance.proximity_type = ProximityType.DISTANCE

  log.info(f"Proximity model deserialized from {directory}")
  return instance
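The serialization changes above move everything to pathlib: metadata.json, df.pkl, scaler.pkl, and nn_model.pkl all land in one directory, and the classmethod that rebuilds the instance re-derives X by re-applying the saved scaler to the saved DataFrame. A minimal round-trip sketch, assuming the rebuilding classmethod is named `deserialize` (its `def` line falls outside the hunks shown), the import path from entry 1 of the file list, and illustrative data plus a hypothetical output path:

```python
import pandas as pd

# Import path assumed from the file list above
from workbench.algorithms.dataframe.proximity import Proximity

# Illustrative data (not from the package)
df = pd.DataFrame({"ID": [1, 2, 3, 4], "f1": [0.1, 0.2, 0.4, 0.8], "f2": [1.0, 0.9, 0.7, 0.3]})
prox = Proximity(df, id_column="ID", features=["f1", "f2"], n_neighbors=2)

# serialize() creates the directory and writes metadata.json, df.pkl, scaler.pkl, nn_model.pkl
prox.serialize("/tmp/proximity_model")

# Rebuild without re-fitting; the restored instance recomputes X from the saved scaler + DataFrame
restored = Proximity.deserialize("/tmp/proximity_model")
print(restored.all_neighbors())
```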
@@ -294,10 +332,10 @@ if __name__ == "__main__":
  print(prox.all_neighbors())

  # Test the neighbors method
- print(prox.neighbors(query_df=df.iloc[[0]]))
+ print(prox.neighbors(1))

  # Test the neighbors method with radius
- print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
+ print(prox.neighbors(1, radius=2.0))

  # Test with data that isn't in the 'train' dataframe
  query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
  "Feature3": [2.31],
  }
  query_df = pd.DataFrame(query_data)
- print(prox.neighbors(query_df=query_df))
+ print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()

  # Test with Features list
  prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
  print(prox.all_neighbors())

  # Test the neighbors method
- print(prox.neighbors(query_df=df.iloc[0:2]))
+ print(prox.neighbors(["a", "b"]))

  # Time neighbors with all IDs versus calling all_neighbors
  import time

  start_time = time.time()
- prox_df = prox.neighbors(query_df=df, include_self=False)
+ prox_df = prox.find_neighbors(query_df=df, include_self=False)
  end_time = time.time()
  print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
  start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":

  # Test querying without the id_column
  df_no_id = df.drop(columns=["foo_id"])
- print(prox.neighbors(query_df=df_no_id, include_self=False))
+ print(prox.find_neighbors(query_df=df_no_id, include_self=False))

  # Test duplicate IDs
  data = {
@@ -379,6 +417,9 @@

  fs = FeatureSet("abalone_features")
  model = Model("abalone-regression")
+ features = model.features()
  df = fs.pull_dataframe()
- prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
- print(prox.neighbors(query_df=df[0:2]))
+ prox = Proximity(
+ df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+ )
+ print(prox.find_neighbors(query_df=df[0:2]))
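Taken together, the proximity.py changes split neighbor lookups into two entry points: `neighbors()` now takes an ID (or list of IDs) already present in the fitted dataset, while `find_neighbors()` handles arbitrary query DataFrames (the old `neighbors(query_df=...)` call). A small sketch of the new calling convention, assuming the import path from the file list above and made-up data:

```python
import pandas as pd

from workbench.algorithms.dataframe.proximity import Proximity  # path assumed from the file list

# Illustrative data; six rows so default neighbor counts stay valid
df = pd.DataFrame(
    {
        "ID": ["a", "b", "c", "d", "e", "f"],
        "Feature1": [0.1, 0.3, 0.35, 0.9, 0.5, 0.7],
        "Feature2": [1.2, 1.0, 0.95, 0.2, 0.6, 0.4],
    }
)
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2"], n_neighbors=3)

# Rows already in the fitted dataset: look them up by ID (single ID or a list)
print(prox.neighbors("a", n_neighbors=2))
print(prox.neighbors(["a", "c"], radius=2.0))

# Rows NOT in the fitted dataset: pass a query DataFrame instead
query_df = pd.DataFrame({"Feature1": [0.33], "Feature2": [0.97]})
print(prox.find_neighbors(query_df, n_neighbors=3))
```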
@@ -14,7 +14,7 @@ import pandas as pd
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
  # Create and train the Regression/Confidence model
  # model = BayesianRidge()
  model = BayesianRidge(
- alpha_1=1e-6, alpha_2=1e-6,  # Noise precision
- lambda_1=1e-6, lambda_2=1e-6,  # Weight precision
+ alpha_1=1e-6,
+ alpha_2=1e-6,  # Noise precision
+ lambda_1=1e-6,
+ lambda_2=1e-6,  # Weight precision
  fit_intercept=True,
  )

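For context on the BayesianRidge block reformatted above: alpha_1/alpha_2 set the Gamma prior on the noise precision and lambda_1/lambda_2 the prior on the weight precision, and the fitted estimator can return a per-row predictive standard deviation, which is what makes it usable as a regression-plus-confidence model. A standalone sketch on synthetic data (values and shapes are illustrative, not from the template):

```python
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

model = BayesianRidge(
    alpha_1=1e-6,
    alpha_2=1e-6,  # Noise precision prior
    lambda_1=1e-6,
    lambda_2=1e-6,  # Weight precision prior
    fit_intercept=True,
)
model.fit(X, y)

# return_std=True yields the predictive standard deviation alongside the point prediction
y_pred, y_std = model.predict(X[:5], return_std=True)
print(y_pred, y_std)
```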
@@ -4,11 +4,7 @@ import awswrangler as wr
  import numpy as np

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
  from sklearn.model_selection import KFold
  from scipy.optimize import minimize

@@ -23,7 +19,7 @@ TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
  "train_all_data": "{{train_all_data}}",
- "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+ "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  }

@@ -47,7 +43,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +86,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -172,16 +165,14 @@ if __name__ == "__main__":
  cv_residuals = np.array(cv_residuals)
  cv_uncertainties = np.array(cv_uncertainties)

-
  # Optimize calibration parameters: σ_cal = a * σ_uc + b
  def neg_log_likelihood(params):
  a, b = params
  sigma_cal = a * cv_uncertainties + b
  sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
- return np.sum(0.5 * np.log(2 * np.pi * sigma_cal ** 2) + 0.5 * (cv_residuals ** 2) / (sigma_cal ** 2))
+ return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))

-
- result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+ result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
  cal_a, cal_b = result.x

  print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +196,9 @@ if __name__ == "__main__":
  result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)

  # Compute uncalibrated uncertainty
- result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(axis=1)
+ result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+ axis=1
+ )

  # Apply calibration to uncertainty
  result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -352,4 +345,4 @@ def predict_fn(df, models) -> pd.DataFrame:
  df = df.reindex(sorted(df.columns), axis=1)

  # All done, return the DataFrame
- return df
+ return df
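The calibration step in the hunks above fits sigma_cal = a * sigma_uc + b by minimizing the Gaussian negative log-likelihood of the cross-validated residuals, then applies the same affine map at inference time (prediction_std = cal_a * prediction_std_uc + cal_b). A self-contained sketch of that optimization with made-up residuals and uncertainties (illustrative values only):

```python
import numpy as np
from scipy.optimize import minimize

# Hypothetical cross-validation outputs: residuals and uncalibrated per-row uncertainties
cv_residuals = np.array([0.5, -1.2, 0.3, 0.8, -0.4, 1.1])
cv_uncertainties = np.array([0.4, 0.9, 0.35, 0.7, 0.5, 0.8])


def neg_log_likelihood(params):
    a, b = params
    sigma_cal = np.maximum(a * cv_uncertainties + b, 1e-8)  # Prevent division by zero
    return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))


result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
cal_a, cal_b = result.x
print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")

# At inference time the calibrated uncertainty would be: sigma = cal_a * sigma_uc + cal_b
```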
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
  TEMPLATE_PARAMS = {
  "features": "{{feature_list}}",
  "target": "{{target_column}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

  from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  """
  Matches and renames DataFrame columns to match model feature names (case-insensitive).
  Prioritizes exact matches, then case-insensitive matches.
-
+
  Raises ValueError if any model features cannot be matched.
  """
  df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  rename_dict[df_columns_lower[feature.lower()]] = feature
  else:
  missing.append(feature)
-
+
  if missing:
  raise ValueError(f"Features not found: {missing}")

@@ -76,10 +76,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
  )

  # Create a Pipeline with StandardScaler
- model = Pipeline([
- ("scaler", StandardScaler()),
- ("model", model)
- ])
+ model = Pipeline([("scaler", StandardScaler()), ("model", model)])

  # Prepare features and targets for training
  X_train = df_train[features]
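The one-line Pipeline in the final hunk is the standard scikit-learn pattern for keeping scaling inside the estimator, so the StandardScaler is fit only on the training split and applied consistently at predict time. A minimal sketch with a stand-in regressor (the hunk does not show which estimator this template actually wraps):

```python
import numpy as np
from sklearn.linear_model import Ridge  # stand-in; not necessarily the template's estimator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Illustrative data
rng = np.random.default_rng(0)
X = rng.normal(size=(120, 2))
y = 2.0 * X[:, 0] - 0.5 * X[:, 1] + rng.normal(scale=0.1, size=120)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The scaler is fit on X_train only when the whole pipeline is fit (no test-set leakage)
model = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
```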