workbench 0.8.168__py3-none-any.whl → 0.8.193__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. workbench/algorithms/dataframe/proximity.py +143 -102
  2. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  3. workbench/api/compound.py +1 -1
  4. workbench/api/endpoint.py +3 -2
  5. workbench/api/feature_set.py +4 -4
  6. workbench/api/model.py +16 -12
  7. workbench/api/monitor.py +1 -16
  8. workbench/core/artifacts/artifact.py +11 -3
  9. workbench/core/artifacts/data_capture_core.py +355 -0
  10. workbench/core/artifacts/endpoint_core.py +113 -27
  11. workbench/core/artifacts/feature_set_core.py +72 -13
  12. workbench/core/artifacts/model_core.py +71 -49
  13. workbench/core/artifacts/monitor_core.py +33 -249
  14. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  15. workbench/core/cloud_platform/aws/aws_meta.py +11 -4
  16. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  17. workbench/core/transforms/features_to_model/features_to_model.py +11 -6
  18. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
  19. workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
  20. workbench/core/views/training_view.py +49 -53
  21. workbench/core/views/view.py +51 -1
  22. workbench/core/views/view_utils.py +4 -4
  23. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  24. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  25. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  26. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  27. workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
  28. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  29. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  30. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  31. workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
  32. workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
  33. workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
  34. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  35. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  36. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  37. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  38. workbench/model_scripts/script_generation.py +7 -2
  39. workbench/model_scripts/uq_models/mapie.template +492 -0
  40. workbench/model_scripts/uq_models/requirements.txt +1 -0
  41. workbench/model_scripts/xgb_model/generated_model_script.py +34 -43
  42. workbench/model_scripts/xgb_model/xgb_model.template +31 -40
  43. workbench/repl/workbench_shell.py +4 -4
  44. workbench/scripts/lambda_launcher.py +63 -0
  45. workbench/scripts/{ml_pipeline_launcher.py → ml_pipeline_batch.py} +49 -51
  46. workbench/scripts/ml_pipeline_sqs.py +186 -0
  47. workbench/utils/chem_utils/__init__.py +0 -0
  48. workbench/utils/chem_utils/fingerprints.py +134 -0
  49. workbench/utils/chem_utils/misc.py +194 -0
  50. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  51. workbench/utils/chem_utils/mol_standardize.py +450 -0
  52. workbench/utils/chem_utils/mol_tagging.py +348 -0
  53. workbench/utils/chem_utils/projections.py +209 -0
  54. workbench/utils/chem_utils/salts.py +256 -0
  55. workbench/utils/chem_utils/sdf.py +292 -0
  56. workbench/utils/chem_utils/toxicity.py +250 -0
  57. workbench/utils/chem_utils/vis.py +253 -0
  58. workbench/utils/config_manager.py +2 -6
  59. workbench/utils/endpoint_utils.py +5 -7
  60. workbench/utils/license_manager.py +2 -6
  61. workbench/utils/model_utils.py +89 -31
  62. workbench/utils/monitor_utils.py +44 -62
  63. workbench/utils/pandas_utils.py +3 -3
  64. workbench/utils/shap_utils.py +10 -2
  65. workbench/utils/workbench_sqs.py +1 -1
  66. workbench/utils/xgboost_model_utils.py +300 -151
  67. workbench/web_interface/components/model_plot.py +7 -1
  68. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  69. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  70. workbench/web_interface/components/plugins/model_details.py +7 -2
  71. workbench/web_interface/components/plugins/scatter_plot.py +3 -3
  72. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/METADATA +24 -2
  73. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/RECORD +77 -72
  74. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/entry_points.txt +3 -1
  75. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/licenses/LICENSE +1 -1
  76. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  77. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  78. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  79. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  80. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  81. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  82. workbench/model_scripts/pytorch_model/generated_model_script.py +0 -576
  83. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  84. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  85. workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
  86. workbench/utils/chem_utils.py +0 -1556
  87. workbench/utils/fast_inference.py +0 -167
  88. workbench/utils/resource_utils.py +0 -39
  89. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/WHEEL +0 -0
  90. {workbench-0.8.168.dist-info → workbench-0.8.193.dist-info}/top_level.txt +0 -0
@@ -2,10 +2,9 @@ import pandas as pd
  import numpy as np
  from sklearn.preprocessing import StandardScaler
  from sklearn.neighbors import NearestNeighbors
- from typing import List, Dict
+ from typing import List, Dict, Optional
  import logging
  import pickle
- import os
  import json
  from pathlib import Path
  from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
  log = logging.getLogger("workbench")


- # ^Enumerated^ Proximity Types (distance or similarity)
  class ProximityType(Enum):
  DISTANCE = "distance"
  SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
  df: pd.DataFrame,
  id_column: str,
  features: List[str],
- target: str = None,
- track_columns: List[str] = None,
+ target: Optional[str] = None,
+ track_columns: Optional[List[str]] = None,
  n_neighbors: int = 10,
  ):
  """
  Initialize the Proximity class.

  Args:
- df (pd.DataFrame): DataFrame containing data for neighbor computations.
- id_column (str): Name of the column used as the identifier.
- features (List[str]): List of feature column names to be used for neighbor computations.
- target (str, optional): Name of the target column. Defaults to None.
- track_columns (List[str], optional): Additional columns to track in results. Defaults to None.
- n_neighbors (int): Number of neighbors to compute. Defaults to 10.
+ df: DataFrame containing data for neighbor computations.
+ id_column: Name of the column used as the identifier.
+ features: List of feature column names to be used for neighbor computations.
+ target: Name of the target column. Defaults to None.
+ track_columns: Additional columns to track in results. Defaults to None.
+ n_neighbors: Number of neighbors to compute. Defaults to 10.
  """
- self.df = df.dropna(subset=features).copy()
  self.id_column = id_column
- self.n_neighbors = min(n_neighbors, len(self.df) - 1)
  self.target = target
- self.features = features
+ self.track_columns = track_columns or []
+ self.proximity_type = None
  self.scaler = None
  self.X = None
  self.nn = None
- self.proximity_type = None
- self.track_columns = track_columns or []

- # Right now we only support numeric features, so remove any columns that are not numeric
- non_numeric_features = self.df[self.features].select_dtypes(exclude=["number"]).columns.tolist()
- if non_numeric_features:
- log.warning(f"Non-numeric features {non_numeric_features} aren't currently supported...")
- self.features = [f for f in self.features if f not in non_numeric_features]
+ # Filter out non-numeric features
+ self.features = self._validate_features(df, features)
+
+ # Drop NaN rows and set up DataFrame
+ self.df = df.dropna(subset=self.features).copy()
+ self.n_neighbors = min(n_neighbors, len(self.df) - 1)

  # Build the proximity model
  self.build_proximity_model()

+ def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+ """Remove non-numeric features and log warnings."""
+ non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
+ if non_numeric:
+ log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
+ return [f for f in features if f not in non_numeric]
+ return features
+
  def build_proximity_model(self) -> None:
- """Standardize features and fit Nearest Neighbors model.
- Note: This method can be overridden in subclasses for custom behavior."""
+ """Standardize features and fit Nearest Neighbors model."""
  self.proximity_type = ProximityType.DISTANCE
  self.scaler = StandardScaler()
  self.X = self.scaler.fit_transform(self.df[self.features])
@@ -74,27 +77,60 @@
  Compute nearest neighbors for all rows in the dataset.

  Returns:
- pd.DataFrame: A DataFrame of neighbors and their distances.
+ DataFrame of neighbors and their distances.
  """
  distances, indices = self.nn.kneighbors(self.X)
- results = []

- for i, (dists, nbrs) in enumerate(zip(distances, indices)):
- query_id = self.df.iloc[i][self.id_column]
-
- # Process neighbors
- for neighbor_idx, dist in zip(nbrs, dists):
- # Skip self (neighbor index == current row index)
- if neighbor_idx == i:
- continue
- results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+ results = [
+ self._build_neighbor_result(
+ query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+ )
+ for i, (dists, nbrs) in enumerate(zip(distances, indices))
+ for neighbor_idx, dist in zip(nbrs, dists)
+ if neighbor_idx != i # Skip self
+ ]

  return pd.DataFrame(results)

  def neighbors(
+ self,
+ id_or_ids,
+ n_neighbors: Optional[int] = 5,
+ radius: Optional[float] = None,
+ include_self: bool = True,
+ ) -> pd.DataFrame:
+ """
+ Return neighbors for ID(s) from the existing dataset.
+
+ Args:
+ id_or_ids: Single ID or list of IDs to look up
+ n_neighbors: Number of neighbors to return (default: 5)
+ radius: If provided, find all neighbors within this radius
+ include_self: Whether to include self in results (if present)
+
+ Returns:
+ DataFrame containing neighbors and distances
+ """
+ # Normalize to list
+ ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
+
+ # Validate IDs exist
+ missing_ids = set(ids) - set(self.df[self.id_column])
+ if missing_ids:
+ raise ValueError(f"IDs not found in dataset: {missing_ids}")
+
+ # Filter to requested IDs and preserve order
+ query_df = self.df[self.df[self.id_column].isin(ids)]
+ query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
+
+ # Use the core implementation
+ return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
+
+ def find_neighbors(
  self,
  query_df: pd.DataFrame,
- radius: float = None,
+ n_neighbors: Optional[int] = 5,
+ radius: Optional[float] = None,
  include_self: bool = True,
  ) -> pd.DataFrame:
  """
@@ -102,63 +138,63 @@

  Args:
  query_df: DataFrame containing query points
+ n_neighbors: Number of neighbors to return (default: 5)
  radius: If provided, find all neighbors within this radius
  include_self: Whether to include self in results (if present)

  Returns:
  DataFrame containing neighbors and distances
-
- Note: The query DataFrame must include the feature columns. The id_column is optional.
  """
- # Check if all required features are present
+ # Validate features
  missing = set(self.features) - set(query_df.columns)
  if missing:
  raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")

- # Check if id_column is present
  id_column_present = self.id_column in query_df.columns

- # None of the features can be NaNs, so report rows with NaNs and then drop them
- rows_with_nan = query_df[self.features].isna().any(axis=1)
-
- # Print the ID column for rows with NaNs
- if rows_with_nan.any():
- log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
- log.warning(query_df.loc[rows_with_nan, self.id_column])
-
- # Drop rows with NaNs in feature columns and reassign to query_df
- query_df = query_df.dropna(subset=self.features)
+ # Handle NaN rows
+ query_df = self._handle_nan_rows(query_df, id_column_present)

- # Transform the query features using the model's scaler
+ # Transform query features
  X_query = self.scaler.transform(query_df[self.features])

- # Get neighbors using either radius or k-nearest neighbors
+ # Get neighbors
  if radius is not None:
  distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
  else:
- distances, indices = self.nn.kneighbors(X_query)
+ distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)

  # Build results
- all_results = []
+ results = []
  for i, (dists, nbrs) in enumerate(zip(distances, indices)):
- # Use the ID from the query DataFrame if available, otherwise use the row index
  query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"

  for neighbor_idx, dist in zip(nbrs, dists):
- # Skip if the neighbor is the query itself and include_self is False
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+ # Skip if neighbor is self and include_self is False
  if not include_self and neighbor_id == query_id:
  continue

- all_results.append(
- self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist)
- )
+ results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+
+ results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
+ return results_df
+
+ def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
+ """Drop rows with NaN values in feature columns and log warnings."""
+ rows_with_nan = query_df[self.features].isna().any(axis=1)
+
+ if rows_with_nan.any():
+ log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
+ if id_column_present:
+ log.warning(query_df.loc[rows_with_nan, self.id_column])

- return pd.DataFrame(all_results)
+ return query_df.dropna(subset=self.features)

  def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
  """
- Internal: Build a result dictionary for a single neighbor.
+ Build a result dictionary for a single neighbor.

  Args:
  query_id: ID of the query point
@@ -169,27 +205,30 @@
  Dictionary containing neighbor information
  """
  neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+ neighbor_row = self.df.iloc[neighbor_idx]

- # Basic neighbor info
- neighbor_info = {
+ # Start with basic info
+ result = {
  self.id_column: query_id,
  "neighbor_id": neighbor_id,
  "distance": distance,
  }

- # Determine which additional columns to include
- relevant_cols = [self.target, "prediction"] if self.target else []
- relevant_cols += [c for c in self.df.columns if "_proba" in c or "residual" in c]
- relevant_cols += ["outlier"]
+ # Columns to automatically include if they exist
+ auto_include = (
+ ([self.target, "prediction"] if self.target else [])
+ + self.track_columns
+ + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
+ )

- # Add user-specified columns
- relevant_cols += self.track_columns
+ # Add values for existing columns
+ for col in auto_include:
+ if col in self.df.columns:
+ result[col] = neighbor_row[col]

- # Add values for each relevant column that exists in the dataframe
- for col in filter(lambda c: c in self.df.columns, relevant_cols):
- neighbor_info[col] = self.df.iloc[neighbor_idx][col]
-
- return neighbor_info
+ # Truncate very small distances to zero
+ result["distance"] = 0.0 if distance < 1e-7 else distance
+ return result

  def serialize(self, directory: str) -> None:
  """
@@ -198,8 +237,8 @@
  Args:
  directory: Directory path to save the model components
  """
- # Create directory if it doesn't exist
- os.makedirs(directory, exist_ok=True)
+ dir_path = Path(directory)
+ dir_path.mkdir(parents=True, exist_ok=True)

  # Save metadata
  metadata = {
@@ -210,17 +249,16 @@
  "n_neighbors": self.n_neighbors,
  }

- with open(os.path.join(directory, "metadata.json"), "w") as f:
- json.dump(metadata, f)
+ (dir_path / "metadata.json").write_text(json.dumps(metadata))

- # Save the DataFrame
- self.df.to_pickle(os.path.join(directory, "df.pkl"))
+ # Save DataFrame
+ self.df.to_pickle(dir_path / "df.pkl")

- # Save the scaler and nearest neighbors model
- with open(os.path.join(directory, "scaler.pkl"), "wb") as f:
+ # Save models
+ with open(dir_path / "scaler.pkl", "wb") as f:
  pickle.dump(self.scaler, f)

- with open(os.path.join(directory, "nn_model.pkl"), "wb") as f:
+ with open(dir_path / "nn_model.pkl", "wb") as f:
  pickle.dump(self.nn, f)

  log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@
  directory: Directory path containing the serialized model components

  Returns:
- Proximity: A new Proximity instance
+ A new Proximity instance
  """
- directory_path = Path(directory)
- if not directory_path.exists() or not directory_path.is_dir():
+ dir_path = Path(directory)
+ if not dir_path.is_dir():
  raise ValueError(f"Directory {directory} does not exist or is not a directory")

  # Load metadata
- with open(os.path.join(directory, "metadata.json"), "r") as f:
- metadata = json.load(f)
+ metadata = json.loads((dir_path / "metadata.json").read_text())

  # Load DataFrame
- df_path = os.path.join(directory, "df.pkl")
- if not os.path.exists(df_path):
+ df_path = dir_path / "df.pkl"
+ if not df_path.exists():
  raise FileNotFoundError(f"DataFrame file not found at {df_path}")
  df = pd.read_pickle(df_path)

- # Create instance but skip _prepare_data
+ # Create instance without calling __init__
  instance = cls.__new__(cls)
  instance.df = df
  instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@
  instance.track_columns = metadata["track_columns"]
  instance.n_neighbors = metadata["n_neighbors"]

- # Load scaler and nn model
- with open(os.path.join(directory, "scaler.pkl"), "rb") as f:
+ # Load models
+ with open(dir_path / "scaler.pkl", "rb") as f:
  instance.scaler = pickle.load(f)

- with open(os.path.join(directory, "nn_model.pkl"), "rb") as f:
+ with open(dir_path / "nn_model.pkl", "rb") as f:
  instance.nn = pickle.load(f)

- # Load X from scaler transform
+ # Restore X
  instance.X = instance.scaler.transform(instance.df[instance.features])
+ instance.proximity_type = ProximityType.DISTANCE

  log.info(f"Proximity model deserialized from {directory}")
  return instance
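The serialize/deserialize hunks above move model persistence from os.path onto pathlib. A minimal round-trip sketch (assuming Proximity is imported from the workbench/algorithms/dataframe/proximity.py module listed in the file table, and that deserialize is exposed as a classmethod, as its use of cls suggests):

```python
import pandas as pd
from workbench.algorithms.dataframe.proximity import Proximity  # path from file #1 above

# Small hypothetical dataset with only numeric features
df = pd.DataFrame({
    "ID": [1, 2, 3, 4, 5, 6],
    "Feature1": [0.1, 0.4, 0.5, 0.9, 1.2, 1.5],
    "Feature2": [1.0, 0.8, 0.2, 0.1, 0.3, 0.7],
})
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2"], n_neighbors=3)

# Round-trip through the directory-based serialization shown in the diff
prox.serialize("/tmp/prox_model")  # writes metadata.json, df.pkl, scaler.pkl, nn_model.pkl
restored = Proximity.deserialize("/tmp/prox_model")
print(restored.all_neighbors())
```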
@@ -294,10 +332,10 @@ if __name__ == "__main__":
  print(prox.all_neighbors())

  # Test the neighbors method
- print(prox.neighbors(query_df=df.iloc[[0]]))
+ print(prox.neighbors(1))

  # Test the neighbors method with radius
- print(prox.neighbors(query_df=df.iloc[0:2], radius=2.0))
+ print(prox.neighbors(1, radius=2.0))

  # Test with data that isn't in the 'train' dataframe
  query_data = {
@@ -307,7 +345,7 @@
  "Feature3": [2.31],
  }
  query_df = pd.DataFrame(query_data)
- print(prox.neighbors(query_df=query_df))
+ print(prox.find_neighbors(query_df=query_df)) # For new data we use find_neighbors()

  # Test with Features list
  prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@
  print(prox.all_neighbors())

  # Test the neighbors method
- print(prox.neighbors(query_df=df.iloc[0:2]))
+ print(prox.neighbors(["a", "b"]))

  # Time neighbors with all IDs versus calling all_neighbors
  import time

  start_time = time.time()
- prox_df = prox.neighbors(query_df=df, include_self=False)
+ prox_df = prox.find_neighbors(query_df=df, include_self=False)
  end_time = time.time()
  print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
  start_time = time.time()
@@ -361,7 +399,7 @@

  # Test querying without the id_column
  df_no_id = df.drop(columns=["foo_id"])
- print(prox.neighbors(query_df=df_no_id, include_self=False))
+ print(prox.find_neighbors(query_df=df_no_id, include_self=False))

  # Test duplicate IDs
  data = {
@@ -379,6 +417,9 @@

  fs = FeatureSet("abalone_features")
  model = Model("abalone-regression")
+ features = model.features()
  df = fs.pull_dataframe()
- prox = Proximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
- print(prox.neighbors(query_df=df[0:2]))
+ prox = Proximity(
+ df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+ )
+ print(prox.find_neighbors(query_df=df[0:2]))
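The __main__ changes above reflect the API split in this release: neighbors() now looks up IDs that already exist in the dataset, while find_neighbors() takes a query DataFrame that only needs the feature columns. A condensed sketch, continuing from the prox object in the previous example (parameter values here are illustrative only):

```python
# Look up neighbors for rows that are already in the dataset, by ID
print(prox.neighbors(1, n_neighbors=3))        # 3 nearest neighbors of ID 1
print(prox.neighbors([1, 3], radius=1.5))      # all neighbors within radius 1.5 for IDs 1 and 3

# Query with new, out-of-dataset rows (only the feature columns are required)
new_rows = pd.DataFrame({"Feature1": [0.25], "Feature2": [0.6]})
print(prox.find_neighbors(new_rows, n_neighbors=3, include_self=False))
```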
@@ -1,3 +1 @@
- # Note: NGBoost is not included in the default inference image, so it must be specified here.
- ngboost
- mapie
+ # Note: Most libs are already in the training/inference images, ONLY specify additional libs here
@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
  "model_type": "{{model_type}}",
  "target_column": "{{target_column}}",
  "feature_list": "{{feature_list}}",
- "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+ "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  }

  # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
  import numpy as np

  # Model Performance Scores
- from sklearn.metrics import (
- mean_absolute_error,
- r2_score,
- root_mean_squared_error
- )
+ from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

  from io import StringIO
  import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
  print(msg)
  raise ValueError(msg)

+
  def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
  """
  Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@
  result_df["residual"] = result_df[target] - result_df["prediction"]
  result_df["residual_abs"] = result_df["residual"].abs()

-
  # Save the results dataframe to S3
  wr.s3.to_csv(
  result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
  """Parse input data and return a DataFrame."""
  if not input_data:
  raise ValueError("Empty input data is not supported!")
-
+
  # Decode bytes to string if necessary
  if isinstance(input_data, bytes):
  input_data = input_data.decode("utf-8")
@@ -36,12 +36,12 @@ from typing import List, Tuple
  # Template Parameters
  TEMPLATE_PARAMS = {
  "model_type": "{{model_type}}",
- "target_column": "{{target_column}}",
+ "target": "{{target_column}}",
  "features": "{{feature_list}}",
  "compressed_features": "{{compressed_features}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
  "train_all_data": "{{train_all_data}}",
- "hyperparameters": "{{hyperparameters}}"
+ "hyperparameters": "{{hyperparameters}}",
  }


@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  df_columns_lower = {col.lower(): col for col in df.columns}
  rename_dict = {}
  missing = []
-
  for feature in model_features:
  if feature in df.columns:
  continue # Exact match
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
  if missing:
  raise ValueError(f"Features not found: {missing}")

+ # Rename the DataFrame columns to match the model features
  return df.rename(columns=rename_dict)


@@ -210,7 +210,7 @@ def model_fn(model_dir):
  original_cwd = os.getcwd()
  try:
  # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
- os.chdir('/tmp')
+ os.chdir("/tmp")

  # Load the model
  model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@
  """The main function is for training the PyTorch Tabular model"""

  # Harness Template Parameters
- target = TEMPLATE_PARAMS["target_column"]
+ target = TEMPLATE_PARAMS["target"]
  features = TEMPLATE_PARAMS["features"]
  orig_features = features.copy()
  compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@
  args = parser.parse_args()

  # Read the training data into DataFrames
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train)
- if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  print(f"Training Files: {training_files}")

  # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@
  }

  # Override defaults with training_config if present
- training_overrides = {k: v for k, v in hyperparameters.get('training_config', {}).items()
- if k in trainer_defaults}
+ training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
  # Print overwrites
  for key, value in training_overrides.items():
  print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@
  "initialization": "kaiming",
  }
  # Override defaults with model_config if present
- model_overrides = {k: v for k, v in hyperparameters.get('model_config', {}).items()
- if k in model_defaults}
+ model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
  # Print overwrites
  for key, value in model_overrides.items():
  print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
@@ -461,10 +455,7 @@
  # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
  # Works effectively for both regression and classification as the foundational
  # architecture in PyTorch Tabular
- model_config = CategoryEmbeddingModelConfig(
- task=task,
- **model_params
- )
+ model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
  optimizer_config = OptimizerConfig()

  #####################################
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
  "feature_list": "{{feature_list}}",
  "model_class": "{{model_class}}",
  "model_metrics_s3_path": "{{model_metrics_s3_path}}",
- "train_all_data": "{{train_all_data}}"
+ "train_all_data": "{{train_all_data}}",
  }

  import awswrangler as wr
@@ -99,10 +99,7 @@
  args = parser.parse_args()

  # Load training data from the specified directory
- training_files = [
- os.path.join(args.train, file)
- for file in os.listdir(args.train) if file.endswith(".csv")
- ]
+ training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
  all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

  # Check if the DataFrame is empty
@@ -116,10 +113,7 @@

  if needs_standardization:
  # Create a pipeline with standardization and the model
- model = Pipeline([
- ("scaler", StandardScaler()),
- ("model", model)
- ])
+ model = Pipeline([("scaler", StandardScaler()), ("model", model)])

  # Handle logic based on the model_type
  if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@
  with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
  json.dump(feature_list, fp)

+
  #
  # Inference Section
  #
@@ -70,6 +70,11 @@ def fill_template(template_path: str, params: dict, output_script: str) -> str:
  # Sanity check to ensure all placeholders were replaced
  if "{{" in template and "}}" in template:
  msg = "Not all template placeholders were replaced. Please check your params."
+
+ # Show which placeholders are still present
+ start = template.index("{{")
+ end = template.index("}}", start) + 2
+ msg += f" Unreplaced placeholder: {template[start:end]}"
  log.critical(msg)
  raise ValueError(msg)

@@ -112,8 +117,8 @@ def generate_model_script(template_params: dict) -> str:
  template_name = "xgb_model.template"
  model_script_dir = "xgb_model"
  elif template_params["model_type"] == ModelType.UQ_REGRESSOR:
- template_name = "quant_regression.template"
- model_script_dir = "quant_regression"
+ template_name = "mapie.template"
+ model_script_dir = "uq_models"
  elif template_params["model_type"] == ModelType.ENSEMBLE_REGRESSOR:
  template_name = "ensemble_xgb.template"
  model_script_dir = "ensemble_xgb"
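The fill_template() hunk further above now reports which placeholder survived substitution. A standalone illustration of that string scan (hypothetical snippet, not the actual workbench function):

```python
# Hypothetical template fragment with one placeholder left unfilled
template = '"target_column": "{{target_column}}",'

if "{{" in template and "}}" in template:
    msg = "Not all template placeholders were replaced. Please check your params."
    # Same scan as in the diff: locate the first unreplaced {{...}} token
    start = template.index("{{")
    end = template.index("}}", start) + 2
    msg += f" Unreplaced placeholder: {template[start:end]}"
    print(msg)  # ... Unreplaced placeholder: {{target_column}}
```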