workbench 0.8.177__py3-none-any.whl → 0.8.227__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of workbench might be problematic.
- workbench/__init__.py +1 -0
- workbench/algorithms/dataframe/__init__.py +1 -2
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +259 -305
- workbench/algorithms/graph/light/proximity_graph.py +12 -11
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/column_stats.py +0 -1
- workbench/algorithms/sql/correlations.py +0 -1
- workbench/algorithms/sql/descriptive_stats.py +0 -1
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +5 -1
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +14 -12
- workbench/api/feature_set.py +117 -11
- workbench/api/meta.py +0 -1
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +52 -21
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_meta.py +0 -1
- workbench/cached/cached_model.py +49 -11
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +319 -204
- workbench/core/artifacts/feature_set_core.py +249 -45
- workbench/core/artifacts/model_core.py +135 -82
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/cloud_platform/cloud_meta.py +0 -1
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +60 -44
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
- workbench/core/views/training_view.py +113 -42
- workbench/core/views/view.py +53 -3
- workbench/core/views/view_utils.py +4 -4
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +277 -0
- workbench/model_scripts/chemprop/chemprop.template +774 -0
- workbench/model_scripts/chemprop/generated_model_script.py +774 -0
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +440 -496
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
- workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +15 -12
- workbench/model_scripts/uq_models/generated_model_script.py +248 -0
- workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +277 -0
- workbench/model_scripts/xgb_model/xgb_model.template +367 -399
- workbench/repl/workbench_shell.py +18 -14
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/lambda_test.py +73 -0
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/ml_pipeline_sqs.py +122 -6
- workbench/scripts/training_test.py +85 -0
- workbench/themes/dark/custom.css +59 -0
- workbench/themes/dark/plotly.json +5 -5
- workbench/themes/light/custom.css +153 -40
- workbench/themes/light/plotly.json +9 -9
- workbench/themes/midnight_blue/custom.css +59 -0
- workbench/utils/aws_utils.py +0 -1
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/mol_descriptors.py +0 -1
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chem_utils/vis.py +25 -27
- workbench/utils/chemprop_utils.py +141 -0
- workbench/utils/config_manager.py +2 -6
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/markdown_utils.py +57 -0
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +260 -76
- workbench/utils/pipeline_utils.py +0 -1
- workbench/utils/plot_utils.py +159 -34
- workbench/utils/pytorch_utils.py +87 -0
- workbench/utils/shap_utils.py +11 -57
- workbench/utils/theme_manager.py +95 -30
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +127 -220
- workbench/web_interface/components/experiments/outlier_plot.py +0 -1
- workbench/web_interface/components/model_plot.py +16 -2
- workbench/web_interface/components/plugin_unit_test.py +5 -3
- workbench/web_interface/components/plugins/ag_table.py +2 -4
- workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
- workbench/web_interface/components/plugins/model_details.py +48 -80
- workbench/web_interface/components/plugins/scatter_plot.py +192 -92
- workbench/web_interface/components/settings_menu.py +184 -0
- workbench/web_interface/page_views/main_page.py +0 -1
- {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
- {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/RECORD +121 -106
- {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
- {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -494
- workbench/model_scripts/custom_models/uq_models/mapie.template +0 -494
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/themes/quartz/base_css.url +0 -1
- workbench/themes/quartz/custom.css +0 -117
- workbench/themes/quartz/plotly.json +0 -642
- workbench/themes/quartz_dark/base_css.url +0 -1
- workbench/themes/quartz_dark/custom.css +0 -131
- workbench/themes/quartz_dark/plotly.json +0 -642
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
- {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
`workbench/model_scripts/custom_models/proximity/feature_space_proximity.py` — new file (`@@ -0,0 +1,194 @@`):

```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from typing import List, Optional
import logging

# Workbench Imports
from workbench.algorithms.dataframe.proximity import Proximity
from workbench.algorithms.dataframe.projection_2d import Projection2D

# Set up logging
log = logging.getLogger("workbench")


class FeatureSpaceProximity(Proximity):
    """Proximity computations for numeric feature spaces using Euclidean distance."""

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        features: List[str],
        target: Optional[str] = None,
        include_all_columns: bool = False,
    ):
        """
        Initialize the FeatureSpaceProximity class.

        Args:
            df: DataFrame containing data for neighbor computations.
            id_column: Name of the column used as the identifier.
            features: List of feature column names to be used for neighbor computations.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
        """
        # Validate and filter features before calling parent init
        self._raw_features = features
        super().__init__(
            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
        )

    def _prepare_data(self) -> None:
        """Filter out non-numeric features and drop NaN rows."""
        # Validate features
        self.features = self._validate_features(self.df, self._raw_features)

        # Drop NaN rows for the features we're using
        self.df = self.df.dropna(subset=self.features).copy()

    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
        """Remove non-numeric features and log warnings."""
        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
        if non_numeric:
            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
        return [f for f in features if f not in non_numeric]

    def _build_model(self) -> None:
        """Standardize features and fit Nearest Neighbors model."""
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(self.df[self.features])
        self.nn = NearestNeighbors().fit(X)

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        """Transform features using the fitted scaler."""
        return self.scaler.transform(df[self.features])

    def _project_2d(self) -> None:
        """Project the numeric features to 2D for visualization."""
        if len(self.features) >= 2:
            self.df = Projection2D().fit_transform(self.df, features=self.features)


# Testing the FeatureSpaceProximity class
if __name__ == "__main__":

    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Create a sample DataFrame
    data = {
        "ID": [1, 2, 3, 4, 5],
        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
    }
    df = pd.DataFrame(data)

    # Test the FeatureSpaceProximity class
    features = ["Feature1", "Feature2", "Feature3"]
    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
    print(prox.neighbors(1, n_neighbors=2))

    # Test the neighbors method with radius
    print(prox.neighbors(1, radius=2.0))

    # Test with Features list
    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
    print(prox.neighbors(1))

    # Create a sample DataFrame
    data = {
        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
        "target": [1, 0, 1, 0, 5],
    }
    df = pd.DataFrame(data)

    # Test with String Ids
    prox = FeatureSpaceProximity(
        df,
        id_column="id",
        features=["Feature1", "Feature2"],
        target="target",
        include_all_columns=True,
    )
    print(prox.neighbors(["a", "b"]))

    # Test duplicate IDs
    data = {
        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
        "target": [1, 0, 1, 0, 5],
    }
    df = pd.DataFrame(data)
    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
    print(df.equals(prox.df))

    # Test on real data from Workbench
    from workbench.api import FeatureSet, Model

    fs = FeatureSet("aqsol_features")
    model = Model("aqsol-regression")
    features = model.features()
    df = fs.pull_dataframe()
    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
    print("\n" + "=" * 80)
    print("Testing Neighbors...")
    print("=" * 80)
    test_id = df[fs.id_column].tolist()[0]
    print(f"\nNeighbors for ID {test_id}:")
    print(prox.neighbors(test_id))

    print("\n" + "=" * 80)
    print("Testing isolated_compounds...")
    print("=" * 80)

    # Test isolated data in the top 1%
    isolated_1pct = prox.isolated(top_percent=1.0)
    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
    print(isolated_1pct)

    # Test isolated data in the top 5%
    isolated_5pct = prox.isolated(top_percent=5.0)
    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
    print(isolated_5pct)

    print("\n" + "=" * 80)
    print("Testing target_gradients...")
    print("=" * 80)

    # Test with different parameters
    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
    print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
    print(gradients_1pct)

    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
    print(gradients_5pct)

    # Test proximity_stats
    print("\n" + "=" * 80)
    print("Testing proximity_stats...")
    print("=" * 80)
    stats = prox.proximity_stats()
    print(stats)

    # Plot the distance distribution using pandas
    print("\n" + "=" * 80)
    print("Plotting distance distribution...")
    print("=" * 80)
    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")

    # Visualize the 2D projection
    print("\n" + "=" * 80)
    print("Visualizing 2D Projection...")
    print("=" * 80)
    from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
    from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot

    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
    unit_test.run()
```
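For orientation: the class above only overrides private hooks; the driving logic lives in `workbench.algorithms.dataframe.proximity.Proximity`, which this release also reworks (+259 −305). Below is a minimal sketch of the template-method contract those overrides imply — the hook names come from the file above, but everything else (the `ProximitySketch` name, the hook call order, the `neighbors` internals) is an assumption, not the actual Proximity source:

```python
# Hypothetical sketch only -- NOT the real Proximity implementation.
from typing import List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors


class ProximitySketch:
    """Template-method base: __init__ drives the hooks that subclasses override."""

    def __init__(self, df: pd.DataFrame, id_column: str, features: List[str],
                 target: Optional[str] = None, include_all_columns: bool = False):
        self.df, self.id_column, self.features = df.copy(), id_column, list(features)
        self.target, self.include_all_columns = target, include_all_columns
        self._prepare_data()   # hook: filter features / drop bad rows
        self._build_model()    # hook: fit scaler + NearestNeighbors
        self._project_2d()     # hook: add x/y columns for plotting

    # Default hooks (FeatureSpaceProximity overrides all of these)
    def _prepare_data(self) -> None:
        pass

    def _build_model(self) -> None:
        self.nn = NearestNeighbors().fit(self._transform_features(self.df))

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        return df[self.features].to_numpy()

    def _project_2d(self) -> None:
        pass

    def neighbors(self, ids: Union[list, int, str], n_neighbors: int = 5) -> pd.DataFrame:
        """Find nearest neighbors for the rows whose id_column matches `ids`."""
        ids = ids if isinstance(ids, list) else [ids]
        query = self.df[self.df[self.id_column].isin(ids)]
        distances, indices = self.nn.kneighbors(self._transform_features(query), n_neighbors=n_neighbors)
        cols = list(self.df.columns) if self.include_all_columns else [self.id_column] + self.features
        result = self.df.iloc[indices.ravel()][cols].reset_index(drop=True)
        result["distance"] = distances.ravel()
        return result
```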
`workbench/model_scripts/custom_models/proximity/feature_space_proximity.template` (+8 −10). A few deleted lines here and in the hunks below are truncated in the registry's diff view and are reproduced as-is:

```diff
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "
+    "include_all_columns": "{{include_all_columns}}",
 }
 
 from io import StringIO
@@ -18,7 +18,7 @@ import os
 import pandas as pd
 
 # Local Imports
-from
+from feature_space_proximity import FeatureSpaceProximity
 
 
 # Function to check if dataframe is empty
@@ -61,7 +61,7 @@ if __name__ == "__main__":
     id_column = TEMPLATE_PARAMS["id_column"]
     features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]  # Can be None for unsupervised models
-
+    include_all_columns = TEMPLATE_PARAMS["include_all_columns"]  # Defaults to False
 
     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -73,26 +73,24 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
     check_dataframe(all_df, "training_df")
 
-    # Create the
-    model =
+    # Create the FeatureSpaceProximity model
+    model = FeatureSpaceProximity(all_df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns)
 
     # Now serialize the model
     model.serialize(args.model_dir)
 
+
 # Model loading and prediction functions
 def model_fn(model_dir):
 
     # Deserialize the model
-    model =
+    model = FeatureSpaceProximity.deserialize(model_dir)
     return model
```
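The `"{{...}}"` placeholders above are filled in when Workbench turns a `.template` into a `generated_model_script.py` (see `workbench/model_scripts/script_generation.py` in the file list; its logic is not shown in this diff). As a purely illustrative sketch — the substitution and quoting rules here are assumptions, not the real generator:

```python
# Illustrative only -- the actual rendering lives in script_generation.py.
def fill_template(template: str, params: dict) -> str:
    """Replace each quoted {{key}} placeholder with the repr of its value."""
    for key, value in params.items():
        template = template.replace(f'"{{{{{key}}}}}"', repr(value))
    return template


source = '"include_all_columns": "{{include_all_columns}}",'
print(fill_template(source, {"include_all_columns": False}))
# -> "include_all_columns": False,
```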
`workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template` (+7 −8):

```diff
@@ -14,7 +14,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 
@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
     # Create and train the Regression/Confidence model
     # model = BayesianRidge()
     model = BayesianRidge(
-        alpha_1=1e-6,
-
+        alpha_1=1e-6,
+        alpha_2=1e-6,  # Noise precision
+        lambda_1=1e-6,
+        lambda_2=1e-6,  # Weight precision
         fit_intercept=True,
     )
```
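The four hyperparameters spelled out in the last hunk are the Gamma-prior shape/rate terms of scikit-learn's `BayesianRidge` (`alpha_1`/`alpha_2` for the noise precision, `lambda_1`/`lambda_2` for the weight precision); `1e-6` is scikit-learn's default, so the change mainly makes the priors explicit. What makes this model a useful UQ template is its per-sample predictive standard deviation. A quick illustration on synthetic data (not from the diff):

```python
# Demonstrates BayesianRidge's built-in uncertainty via predict(return_std=True).
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6, fit_intercept=True)
model.fit(X, y)

mean, std = model.predict(X[:5], return_std=True)  # predictive mean and std per sample
print(mean.round(3), std.round(3))
```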
`workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template` (+20 −21):

```diff
@@ -4,13 +4,10 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
 from scipy.optimize import minimize
+from scipy.stats import spearmanr
 
 from io import StringIO
 import json
@@ -23,7 +20,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
     "train_all_data": "{{train_all_data}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 
@@ -47,7 +44,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +87,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -172,16 +166,14 @@ if __name__ == "__main__":
     cv_residuals = np.array(cv_residuals)
     cv_uncertainties = np.array(cv_uncertainties)
 
-
     # Optimize calibration parameters: σ_cal = a * σ_uc + b
     def neg_log_likelihood(params):
         a, b = params
         sigma_cal = a * cv_uncertainties + b
         sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
-        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal
+        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))
 
-
-    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
     cal_a, cal_b = result.x
 
     print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +197,9 @@ if __name__ == "__main__":
     result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)
 
     # Compute uncalibrated uncertainty
-    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+        axis=1
+    )
 
     # Apply calibration to uncertainty
     result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -224,11 +218,16 @@ if __name__ == "__main__":
     # Report Performance Metrics
     rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
     mae = mean_absolute_error(result_df[target], result_df["prediction"])
+    medae = median_absolute_error(result_df[target], result_df["prediction"])
     r2 = r2_score(result_df[target], result_df["prediction"])
-
-
-    print(f"
-    print(f"
+    spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
+    support = len(result_df)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")
 
     # Now save the models
     for name, model in models.items():
@@ -352,4 +351,4 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df.reindex(sorted(df.columns), axis=1)
 
     # All done, return the DataFrame
-    return df
+    return df
```
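The calibration hunk above fits σ_cal = a·σ_uc + b by minimizing the Gaussian negative log-likelihood of the cross-validated residuals. Here is a self-contained replay of the same math on synthetic data, so the optimization can be run in isolation (the data-generating numbers are made up for illustration):

```python
# Replays the template's uncertainty-calibration step on synthetic data.
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
sigma_uc = rng.uniform(0.1, 1.0, size=500)            # uncalibrated stds from an ensemble
residuals = rng.normal(scale=2.0 * sigma_uc + 0.05)   # true noise follows 2*sigma_uc + 0.05


def neg_log_likelihood(params):
    a, b = params
    sigma_cal = np.maximum(a * sigma_uc + b, 1e-8)    # prevent division by zero
    return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * residuals**2 / sigma_cal**2)


result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
print("a, b =", result.x)  # should recover roughly (2.0, 0.05)
```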
`workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py` — new file (`@@ -0,0 +1,194 @@`); its content is identical to `custom_models/proximity/feature_space_proximity.py` shown above.
`workbench/model_scripts/custom_models/uq_models/gaussian_process.template` (+5 −11):

```diff
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-
+
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
@@ -76,10 +76,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
     )
 
     # Create a Pipeline with StandardScaler
-    model = Pipeline([
-        ("scaler", StandardScaler()),
-        ("model", model)
-    ])
+    model = Pipeline([("scaler", StandardScaler()), ("model", model)])
 
     # Prepare features and targets for training
     X_train = df_train[features]
```