workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +143 -102
- workbench/algorithms/graph/light/proximity_graph.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +12 -0
- workbench/api/feature_set.py +4 -4
- workbench/api/meta.py +5 -2
- workbench/api/model.py +16 -12
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +168 -78
- workbench/core/artifacts/feature_set_core.py +72 -13
- workbench/core/artifacts/model_core.py +50 -15
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +9 -4
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +49 -53
- workbench/core/views/view.py +51 -1
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
- workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
- workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +19 -20
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +7 -2
- workbench/model_scripts/uq_models/mapie.template +492 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/xgb_model.template +31 -40
- workbench/repl/workbench_shell.py +11 -6
- workbench/scripts/lambda_launcher.py +63 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +76 -30
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +283 -145
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
- workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
```diff
@@ -2,10 +2,9 @@ import pandas as pd
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
-from typing import List, Dict
+from typing import List, Dict, Optional
 import logging
 import pickle
-import os
 import json
 from pathlib import Path
 from enum import Enum
@@ -14,7 +13,6 @@ from enum import Enum
 log = logging.getLogger("workbench")
 
 
-# Enumerated Proximity Types (distance or similarity)
 class ProximityType(Enum):
     DISTANCE = "distance"
     SIMILARITY = "similarity"
@@ -26,44 +24,49 @@ class Proximity:
         df: pd.DataFrame,
         id_column: str,
         features: List[str],
-        target: str = None,
-        track_columns: List[str] = None,
+        target: Optional[str] = None,
+        track_columns: Optional[List[str]] = None,
         n_neighbors: int = 10,
     ):
         """
         Initialize the Proximity class.
 
         Args:
-            df
-            id_column
-            features
-            target
-            track_columns
-            n_neighbors
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            track_columns: Additional columns to track in results. Defaults to None.
+            n_neighbors: Number of neighbors to compute. Defaults to 10.
         """
-        self.df = df.dropna(subset=features).copy()
         self.id_column = id_column
-        self.n_neighbors = min(n_neighbors, len(self.df) - 1)
         self.target = target
-        self.
+        self.track_columns = track_columns or []
+        self.proximity_type = None
         self.scaler = None
         self.X = None
         self.nn = None
-        self.proximity_type = None
-        self.track_columns = track_columns or []
 
-        #
-
-
-
-
+        # Filter out non-numeric features
+        self.features = self._validate_features(df, features)
+
+        # Drop NaN rows and set up DataFrame
+        self.df = df.dropna(subset=self.features).copy()
+        self.n_neighbors = min(n_neighbors, len(self.df) - 1)
 
         # Build the proximity model
         self.build_proximity_model()
 
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = df[features].select_dtypes(exclude=["number"]).columns.tolist()
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported...")
+            return [f for f in features if f not in non_numeric]
+        return features
+
     def build_proximity_model(self) -> None:
-        """Standardize features and fit Nearest Neighbors model.
-        Note: This method can be overridden in subclasses for custom behavior."""
+        """Standardize features and fit Nearest Neighbors model."""
         self.proximity_type = ProximityType.DISTANCE
         self.scaler = StandardScaler()
         self.X = self.scaler.fit_transform(self.df[self.features])
```
```diff
@@ -74,27 +77,60 @@ class Proximity:
         Compute nearest neighbors for all rows in the dataset.
 
         Returns:
-
+            DataFrame of neighbors and their distances.
         """
         distances, indices = self.nn.kneighbors(self.X)
-        results = []
 
-
-
-
-
-        for
-
-
-
-                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+        results = [
+            self._build_neighbor_result(
+                query_id=self.df.iloc[i][self.id_column], neighbor_idx=neighbor_idx, distance=dist
+            )
+            for i, (dists, nbrs) in enumerate(zip(distances, indices))
+            for neighbor_idx, dist in zip(nbrs, dists)
+            if neighbor_idx != i  # Skip self
+        ]
 
         return pd.DataFrame(results)
 
     def neighbors(
+        self,
+        id_or_ids,
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
+        include_self: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Return neighbors for ID(s) from the existing dataset.
+
+        Args:
+            id_or_ids: Single ID or list of IDs to look up
+            n_neighbors: Number of neighbors to return (default: 5)
+            radius: If provided, find all neighbors within this radius
+            include_self: Whether to include self in results (if present)
+
+        Returns:
+            DataFrame containing neighbors and distances
+        """
+        # Normalize to list
+        ids = [id_or_ids] if not isinstance(id_or_ids, list) else id_or_ids
+
+        # Validate IDs exist
+        missing_ids = set(ids) - set(self.df[self.id_column])
+        if missing_ids:
+            raise ValueError(f"IDs not found in dataset: {missing_ids}")
+
+        # Filter to requested IDs and preserve order
+        query_df = self.df[self.df[self.id_column].isin(ids)]
+        query_df = query_df.set_index(self.id_column).loc[ids].reset_index()
+
+        # Use the core implementation
+        return self.find_neighbors(query_df, n_neighbors=n_neighbors, radius=radius, include_self=include_self)
+
+    def find_neighbors(
         self,
         query_df: pd.DataFrame,
-
+        n_neighbors: Optional[int] = 5,
+        radius: Optional[float] = None,
         include_self: bool = True,
     ) -> pd.DataFrame:
         """
@@ -102,63 +138,63 @@ class Proximity:
 
         Args:
             query_df: DataFrame containing query points
+            n_neighbors: Number of neighbors to return (default: 5)
             radius: If provided, find all neighbors within this radius
             include_self: Whether to include self in results (if present)
 
         Returns:
             DataFrame containing neighbors and distances
-
-        Note: The query DataFrame must include the feature columns. The id_column is optional.
         """
-        #
+        # Validate features
         missing = set(self.features) - set(query_df.columns)
         if missing:
             raise ValueError(f"Query DataFrame is missing required feature columns: {missing}")
 
-        # Check if id_column is present
         id_column_present = self.id_column in query_df.columns
 
-        #
-
-
-        # Print the ID column for rows with NaNs
-        if rows_with_nan.any():
-            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
-            log.warning(query_df.loc[rows_with_nan, self.id_column])
-
-        # Drop rows with NaNs in feature columns and reassign to query_df
-        query_df = query_df.dropna(subset=self.features)
+        # Handle NaN rows
+        query_df = self._handle_nan_rows(query_df, id_column_present)
 
-        # Transform
+        # Transform query features
         X_query = self.scaler.transform(query_df[self.features])
 
-        # Get neighbors
+        # Get neighbors
         if radius is not None:
             distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
         else:
-            distances, indices = self.nn.kneighbors(X_query)
+            distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)
 
         # Build results
-
+        results = []
         for i, (dists, nbrs) in enumerate(zip(distances, indices)):
-            # Use the ID from the query DataFrame if available, otherwise use the row index
            query_id = query_df.iloc[i][self.id_column] if id_column_present else f"query_{i}"
 
             for neighbor_idx, dist in zip(nbrs, dists):
-                # Skip if the neighbor is the query itself and include_self is False
                 neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+
+                # Skip if neighbor is self and include_self is False
                 if not include_self and neighbor_id == query_id:
                     continue
 
-
-
-
+                results.append(self._build_neighbor_result(query_id=query_id, neighbor_idx=neighbor_idx, distance=dist))
+
+        results_df = pd.DataFrame(results).sort_values([self.id_column, "distance"]).reset_index(drop=True)
+        return results_df
+
+    def _handle_nan_rows(self, query_df: pd.DataFrame, id_column_present: bool) -> pd.DataFrame:
+        """Drop rows with NaN values in feature columns and log warnings."""
+        rows_with_nan = query_df[self.features].isna().any(axis=1)
+
+        if rows_with_nan.any():
+            log.warning(f"Found {rows_with_nan.sum()} rows with NaNs in feature columns:")
+            if id_column_present:
+                log.warning(query_df.loc[rows_with_nan, self.id_column])
 
-        return
+        return query_df.dropna(subset=self.features)
 
     def _build_neighbor_result(self, query_id, neighbor_idx: int, distance: float) -> Dict:
         """
-
+        Build a result dictionary for a single neighbor.
 
         Args:
             query_id: ID of the query point
```
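With these hunks, lookups of rows already in the fitted dataset go through the new ID-based `neighbors()`, while DataFrame queries (new/unseen rows) go through `find_neighbors()`. A minimal usage sketch, assuming the import path from the file list above; the data and column names are illustrative:

```python
import pandas as pd

from workbench.algorithms.dataframe.proximity import Proximity  # import path assumed from the file list

# Illustrative data; column names mirror the __main__ tests further down in this diff
df = pd.DataFrame({
    "ID": [1, 2, 3, 4, 5],
    "Feature1": [0.10, 0.45, 0.52, 0.88, 1.23],
    "Feature2": [1.00, 0.80, 0.55, 0.35, 0.15],
    "Feature3": [2.30, 2.10, 2.50, 2.05, 2.20],
})
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2", "Feature3"], n_neighbors=2)

# neighbors(): look up rows that are already in the dataset, by ID
print(prox.neighbors(1))                     # single ID
print(prox.neighbors([1, 3], radius=2.0))    # list of IDs, radius query

# find_neighbors(): query with new rows (only the feature columns are required)
query_df = pd.DataFrame({"Feature1": [0.25], "Feature2": [0.75], "Feature3": [2.31]})
print(prox.find_neighbors(query_df, n_neighbors=3, include_self=False))
```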
```diff
@@ -169,27 +205,30 @@ class Proximity:
             Dictionary containing neighbor information
         """
         neighbor_id = self.df.iloc[neighbor_idx][self.id_column]
+        neighbor_row = self.df.iloc[neighbor_idx]
 
-        #
-
+        # Start with basic info
+        result = {
             self.id_column: query_id,
             "neighbor_id": neighbor_id,
             "distance": distance,
         }
 
-        #
-
-
-
+        # Columns to automatically include if they exist
+        auto_include = (
+            ([self.target, "prediction"] if self.target else [])
+            + self.track_columns
+            + [col for col in self.df.columns if "_proba" in col or "residual" in col or col == "outlier"]
+        )
 
-        # Add
-
+        # Add values for existing columns
+        for col in auto_include:
+            if col in self.df.columns:
+                result[col] = neighbor_row[col]
 
-        #
-
-
-
-        return neighbor_info
+        # Truncate very small distances to zero
+        result["distance"] = 0.0 if distance < 1e-7 else distance
+        return result
 
     def serialize(self, directory: str) -> None:
         """
```
```diff
@@ -198,8 +237,8 @@ class Proximity:
         Args:
             directory: Directory path to save the model components
         """
-
-
+        dir_path = Path(directory)
+        dir_path.mkdir(parents=True, exist_ok=True)
 
         # Save metadata
         metadata = {
@@ -210,17 +249,16 @@ class Proximity:
             "n_neighbors": self.n_neighbors,
         }
 
-
-            json.dump(metadata, f)
+        (dir_path / "metadata.json").write_text(json.dumps(metadata))
 
-        # Save
-        self.df.to_pickle(
+        # Save DataFrame
+        self.df.to_pickle(dir_path / "df.pkl")
 
-        # Save
-        with open(
+        # Save models
+        with open(dir_path / "scaler.pkl", "wb") as f:
             pickle.dump(self.scaler, f)
 
-        with open(
+        with open(dir_path / "nn_model.pkl", "wb") as f:
             pickle.dump(self.nn, f)
 
         log.info(f"Proximity model serialized to {directory}")
@@ -234,23 +272,22 @@ class Proximity:
             directory: Directory path containing the serialized model components
 
         Returns:
-
+            A new Proximity instance
         """
-
-        if not
+        dir_path = Path(directory)
+        if not dir_path.is_dir():
             raise ValueError(f"Directory {directory} does not exist or is not a directory")
 
         # Load metadata
-
-            metadata = json.load(f)
+        metadata = json.loads((dir_path / "metadata.json").read_text())
 
         # Load DataFrame
-        df_path =
-        if not
+        df_path = dir_path / "df.pkl"
+        if not df_path.exists():
             raise FileNotFoundError(f"DataFrame file not found at {df_path}")
         df = pd.read_pickle(df_path)
 
-        # Create instance
+        # Create instance without calling __init__
         instance = cls.__new__(cls)
         instance.df = df
         instance.id_column = metadata["id_column"]
@@ -259,15 +296,16 @@ class Proximity:
         instance.track_columns = metadata["track_columns"]
         instance.n_neighbors = metadata["n_neighbors"]
 
-        # Load
-        with open(
+        # Load models
+        with open(dir_path / "scaler.pkl", "rb") as f:
             instance.scaler = pickle.load(f)
 
-        with open(
+        with open(dir_path / "nn_model.pkl", "rb") as f:
             instance.nn = pickle.load(f)
 
-        #
+        # Restore X
         instance.X = instance.scaler.transform(instance.df[instance.features])
+        instance.proximity_type = ProximityType.DISTANCE
 
         log.info(f"Proximity model deserialized from {directory}")
         return instance
```
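The persistence hunks above move all file handling onto `pathlib.Path` and write four artifacts per model directory. A round-trip sketch, again assuming the import path from the file list; the restoring classmethod's name is not visible in these hunks, so `deserialize` below is an assumption:

```python
import pandas as pd

from workbench.algorithms.dataframe.proximity import Proximity  # import path assumed from the file list

df = pd.DataFrame({"ID": [1, 2, 3], "Feature1": [0.1, 0.5, 0.9], "Feature2": [1.0, 0.6, 0.2]})
prox = Proximity(df, id_column="ID", features=["Feature1", "Feature2"], n_neighbors=2)

# serialize() writes metadata.json, df.pkl, scaler.pkl, and nn_model.pkl into the directory
prox.serialize("/tmp/prox_model")

# Restore without re-fitting (classmethod name assumed; it sits outside the hunk context above)
prox_restored = Proximity.deserialize("/tmp/prox_model")
print(prox_restored.neighbors(1, n_neighbors=2))
```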
```diff
@@ -294,10 +332,10 @@ if __name__ == "__main__":
     print(prox.all_neighbors())
 
     # Test the neighbors method
-    print(prox.neighbors(
+    print(prox.neighbors(1))
 
     # Test the neighbors method with radius
-    print(prox.neighbors(
+    print(prox.neighbors(1, radius=2.0))
 
     # Test with data that isn't in the 'train' dataframe
     query_data = {
@@ -307,7 +345,7 @@ if __name__ == "__main__":
         "Feature3": [2.31],
     }
     query_df = pd.DataFrame(query_data)
-    print(prox.
+    print(prox.find_neighbors(query_df=query_df))  # For new data we use find_neighbors()
 
     # Test with Features list
     prox = Proximity(df, id_column="ID", features=["Feature1"], n_neighbors=2)
@@ -334,13 +372,13 @@ if __name__ == "__main__":
     print(prox.all_neighbors())
 
     # Test the neighbors method
-    print(prox.neighbors(
+    print(prox.neighbors(["a", "b"]))
 
     # Time neighbors with all IDs versus calling all_neighbors
     import time
 
     start_time = time.time()
-    prox_df = prox.
+    prox_df = prox.find_neighbors(query_df=df, include_self=False)
     end_time = time.time()
     print(f"Time taken for neighbors: {end_time - start_time:.4f} seconds")
     start_time = time.time()
@@ -361,7 +399,7 @@ if __name__ == "__main__":
 
     # Test querying without the id_column
     df_no_id = df.drop(columns=["foo_id"])
-    print(prox.
+    print(prox.find_neighbors(query_df=df_no_id, include_self=False))
 
     # Test duplicate IDs
     data = {
@@ -379,6 +417,9 @@ if __name__ == "__main__":
 
     fs = FeatureSet("abalone_features")
     model = Model("abalone-regression")
+    features = model.features()
     df = fs.pull_dataframe()
-    prox = Proximity(
-
+    prox = Proximity(
+        df, id_column=fs.id_column, features=model.features(), target=model.target(), track_columns=features
+    )
+    print(prox.find_neighbors(query_df=df[0:2]))
```
```diff
@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)
 
+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()
 
-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
```
```diff
@@ -36,12 +36,12 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
     "train_all_data": "{{train_all_data}}",
-    "hyperparameters": "{{hyperparameters}}"
+    "hyperparameters": "{{hyperparameters}}",
 }
 
 
@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -210,7 +210,7 @@ def model_fn(model_dir):
     original_cwd = os.getcwd()
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir(
+        os.chdir("/tmp")
 
         # Load the model
         model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -432,9 +428,12 @@ if __name__ == "__main__":
         "gradient_clip_val": 1.0,
     }
 
-    # Override defaults with
-
-
+    # Override defaults with training_config if present
+    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
+    # Print overwrites
+    for key, value in training_overrides.items():
+        print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
+    trainer_params = {**trainer_defaults, **training_overrides}
     trainer_config = TrainerConfig(**trainer_params)
 
     # Model config defaults
@@ -446,17 +445,17 @@ if __name__ == "__main__":
         "use_batch_norm": True,
         "initialization": "kaiming",
     }
-    # Override defaults with
-
-
+    # Override defaults with model_config if present
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
+    # Print overwrites
+    for key, value in model_overrides.items():
+        print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
+    model_params = {**model_defaults, **model_overrides}
 
     # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     # Works effectively for both regression and classification as the foundational
     # architecture in PyTorch Tabular
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        **model_params
-    )
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()
 
     #####################################
```
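The training section above now merges optional `training_config` and `model_config` sections from the hyperparameters into the trainer/model defaults, keeping only keys that already exist in the default dicts. A standalone sketch of that merge, with the default dicts abbreviated to the keys visible in the hunk (the real templates define more defaults):

```python
# Abbreviated defaults, using only the keys visible in the hunk above
trainer_defaults = {"gradient_clip_val": 1.0}
model_defaults = {"use_batch_norm": True, "initialization": "kaiming"}

# Illustrative value for what would arrive via TEMPLATE_PARAMS["hyperparameters"]
hyperparameters = {
    "training_config": {"gradient_clip_val": 0.5, "max_epochs": 50},  # max_epochs dropped by the filter
    "model_config": {"use_batch_norm": False},                        # (not a key of this abbreviated dict)
}

# Keep only keys that already exist in the corresponding defaults
training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}

trainer_params = {**trainer_defaults, **training_overrides}
model_params = {**model_defaults, **model_overrides}
print(trainer_params)  # {'gradient_clip_val': 0.5}
print(model_params)    # {'use_batch_norm': False, 'initialization': 'kaiming'}
```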
```diff
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":
 
     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])
 
     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)
 
+
     #
     # Inference Section
     #
```
```diff
@@ -70,6 +70,11 @@ def fill_template(template_path: str, params: dict, output_script: str) -> str:
     # Sanity check to ensure all placeholders were replaced
     if "{{" in template and "}}" in template:
         msg = "Not all template placeholders were replaced. Please check your params."
+
+        # Show which placeholders are still present
+        start = template.index("{{")
+        end = template.index("}}", start) + 2
+        msg += f" Unreplaced placeholder: {template[start:end]}"
         log.critical(msg)
         raise ValueError(msg)
 
@@ -112,8 +117,8 @@ def generate_model_script(template_params: dict) -> str:
         template_name = "xgb_model.template"
         model_script_dir = "xgb_model"
     elif template_params["model_type"] == ModelType.UQ_REGRESSOR:
-        template_name = "
-        model_script_dir = "
+        template_name = "mapie.template"
+        model_script_dir = "uq_models"
     elif template_params["model_type"] == ModelType.ENSEMBLE_REGRESSOR:
         template_name = "ensemble_xgb.template"
         model_script_dir = "ensemble_xgb"
```
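The `fill_template()` change above makes the sanity-check error name the first unreplaced placeholder instead of failing generically; UQ regressors are also routed to the new `uq_models/mapie.template` (the old `mapie_xgb.template` is removed in this release). A standalone sketch of the improved placeholder check, with an illustrative template string:

```python
# Illustrative filled-template text where one "{{...}}" placeholder survived substitution
template = 'TEMPLATE_PARAMS = {"model_type": "uq_regressor", "target_column": "{{target_column}}"}'

if "{{" in template and "}}" in template:
    msg = "Not all template placeholders were replaced. Please check your params."
    # Locate and report the first unreplaced placeholder, as in the hunk above
    start = template.index("{{")
    end = template.index("}}", start) + 2
    msg += f" Unreplaced placeholder: {template[start:end]}"
    print(msg)  # fill_template() logs this via log.critical() and raises ValueError(msg)
```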