workbench 0.8.162__py3-none-any.whl → 0.8.220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench might be problematic.

Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "track_columns": "{{track_columns}}"
+    "include_all_columns": "{{include_all_columns}}",
 }

 from io import StringIO
@@ -18,7 +18,7 @@ import os
 import pandas as pd

 # Local Imports
-from proximity import Proximity
+from feature_space_proximity import FeatureSpaceProximity


 # Function to check if dataframe is empty
@@ -61,7 +61,7 @@ if __name__ == "__main__":
     id_column = TEMPLATE_PARAMS["id_column"]
     features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]  # Can be None for unsupervised models
-    track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
+    include_all_columns = TEMPLATE_PARAMS["include_all_columns"]  # Defaults to False

     # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
@@ -73,26 +73,24 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
     check_dataframe(all_df, "training_df")

-    # Create the Proximity model
-    model = Proximity(all_df, id_column, features, target, track_columns=track_columns)
+    # Create the FeatureSpaceProximity model
+    model = FeatureSpaceProximity(all_df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns)

     # Now serialize the model
     model.serialize(args.model_dir)

+
 # Model loading and prediction functions
 def model_fn(model_dir):

     # Deserialize the model
-    model = Proximity.deserialize(model_dir)
+    model = FeatureSpaceProximity.deserialize(model_dir)
     return model

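Note: these model-script templates appear to follow the standard SageMaker script convention, where the training data channel and the model output directory arrive as command-line arguments backed by SM_* environment variables, and model_fn() is what the serving container calls to reload the serialized model. A minimal sketch of that wiring, assuming the usual defaults (illustrative only, not the template's literal code):

    # Hypothetical sketch of the argparse wiring these templates rely on
    import argparse
    import os

    parser = argparse.ArgumentParser()
    # SageMaker injects these environment variables inside the training container
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
    args = parser.parse_args()

    # Training writes artifacts to args.model_dir; at inference time the serving
    # container calls model_fn(model_dir) with that same directory to rebuild the model.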
 
@@ -14,7 +14,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -37,7 +37,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-    
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -81,10 +81,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -109,8 +106,10 @@ if __name__ == "__main__":
     # Create and train the Regression/Confidence model
     # model = BayesianRidge()
     model = BayesianRidge(
-        alpha_1=1e-6, alpha_2=1e-6,  # Noise precision
-        lambda_1=1e-6, lambda_2=1e-6,  # Weight precision
+        alpha_1=1e-6,
+        alpha_2=1e-6,  # Noise precision
+        lambda_1=1e-6,
+        lambda_2=1e-6,  # Weight precision
         fit_intercept=True,
     )
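Note: BayesianRidge is used here as a lightweight uncertainty-quantification model because its posterior provides a per-row predictive standard deviation. A minimal sketch of that usage on synthetic data (not the template's actual training flow):

    # BayesianRidge point predictions plus predictive std (synthetic data)
    import numpy as np
    from sklearn.linear_model import BayesianRidge

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 3))
    y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=200)

    model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6, fit_intercept=True)
    model.fit(X, y)

    # return_std=True also returns the predictive standard deviation for each row
    pred, pred_std = model.predict(X[:5], return_std=True)
    print(pred, pred_std)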
 
@@ -4,13 +4,10 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
 from scipy.optimize import minimize
+from scipy.stats import spearmanr

 from io import StringIO
 import json
@@ -23,7 +20,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
     "train_all_data": "{{train_all_data}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }


@@ -47,7 +44,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-    
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -90,10 +87,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -172,16 +166,14 @@ if __name__ == "__main__":
     cv_residuals = np.array(cv_residuals)
     cv_uncertainties = np.array(cv_uncertainties)

-
     # Optimize calibration parameters: σ_cal = a * σ_uc + b
     def neg_log_likelihood(params):
         a, b = params
         sigma_cal = a * cv_uncertainties + b
         sigma_cal = np.maximum(sigma_cal, 1e-8)  # Prevent division by zero
-        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal ** 2) + 0.5 * (cv_residuals ** 2) / (sigma_cal ** 2))
+        return np.sum(0.5 * np.log(2 * np.pi * sigma_cal**2) + 0.5 * (cv_residuals**2) / (sigma_cal**2))

-
-    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method='Nelder-Mead')
+    result = minimize(neg_log_likelihood, x0=[1.0, 0.1], method="Nelder-Mead")
     cal_a, cal_b = result.x

     print(f"Calibration parameters: a={cal_a:.4f}, b={cal_b:.4f}")
@@ -205,7 +197,9 @@ if __name__ == "__main__":
     result_df["prediction"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].mean(axis=1)

     # Compute uncalibrated uncertainty
-    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(axis=1)
+    result_df["prediction_std_uc"] = result_df[[name for name in result_df.columns if name.startswith("m_")]].std(
+        axis=1
+    )

     # Apply calibration to uncertainty
     result_df["prediction_std"] = cal_a * result_df["prediction_std_uc"] + cal_b
@@ -224,11 +218,16 @@ if __name__ == "__main__":
     # Report Performance Metrics
     rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
     mae = mean_absolute_error(result_df[target], result_df["prediction"])
+    medae = median_absolute_error(result_df[target], result_df["prediction"])
     r2 = r2_score(result_df[target], result_df["prediction"])
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
-    print(f"NumRows: {len(result_df)}")
+    spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
+    support = len(result_df)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")

     # Now save the models
     for name, model in models.items():
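The metric block now reports median absolute error and Spearman rank correlation alongside RMSE/MAE/R2, and renames the row count to "support". A tiny illustration of the added metrics on toy values:

    # Quick illustration of the newly reported metrics (toy values)
    from sklearn.metrics import median_absolute_error
    from scipy.stats import spearmanr

    y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
    y_pred = [1.1, 1.9, 3.4, 3.8, 5.2]

    medae = median_absolute_error(y_true, y_pred)          # median of |error|, robust to outliers -> 0.2
    spearman_corr = spearmanr(y_true, y_pred).correlation  # rank correlation -> 1.0 (order preserved)
    support = len(y_true)                                  # number of validation rows -> 5
    print(f"medae: {medae:.3f}  spearmanr: {spearman_corr:.3f}  support: {support}")
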
@@ -352,4 +351,4 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df.reindex(sorted(df.columns), axis=1)

     # All done, return the DataFrame
-    return df
+    return df
@@ -0,0 +1,194 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import NearestNeighbors
+from typing import List, Optional
+import logging
+
+# Workbench Imports
+from workbench.algorithms.dataframe.proximity import Proximity
+from workbench.algorithms.dataframe.projection_2d import Projection2D
+
+# Set up logging
+log = logging.getLogger("workbench")
+
+
+class FeatureSpaceProximity(Proximity):
+    """Proximity computations for numeric feature spaces using Euclidean distance."""
+
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        id_column: str,
+        features: List[str],
+        target: Optional[str] = None,
+        include_all_columns: bool = False,
+    ):
+        """
+        Initialize the FeatureSpaceProximity class.
+
+        Args:
+            df: DataFrame containing data for neighbor computations.
+            id_column: Name of the column used as the identifier.
+            features: List of feature column names to be used for neighbor computations.
+            target: Name of the target column. Defaults to None.
+            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
+        """
+        # Validate and filter features before calling parent init
+        self._raw_features = features
+        super().__init__(
+            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+        )
+
+    def _prepare_data(self) -> None:
+        """Filter out non-numeric features and drop NaN rows."""
+        # Validate features
+        self.features = self._validate_features(self.df, self._raw_features)
+
+        # Drop NaN rows for the features we're using
+        self.df = self.df.dropna(subset=self.features).copy()
+
+    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+        """Remove non-numeric features and log warnings."""
+        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+        if non_numeric:
+            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+        return [f for f in features if f not in non_numeric]
+
+    def _build_model(self) -> None:
+        """Standardize features and fit Nearest Neighbors model."""
+        self.scaler = StandardScaler()
+        X = self.scaler.fit_transform(self.df[self.features])
+        self.nn = NearestNeighbors().fit(X)
+
+    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+        """Transform features using the fitted scaler."""
+        return self.scaler.transform(df[self.features])
+
+    def _project_2d(self) -> None:
+        """Project the numeric features to 2D for visualization."""
+        if len(self.features) >= 2:
+            self.df = Projection2D().fit_transform(self.df, features=self.features)
+
+
+# Testing the FeatureSpaceProximity class
+if __name__ == "__main__":
+
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.width", 1000)
+
+    # Create a sample DataFrame
+    data = {
+        "ID": [1, 2, 3, 4, 5],
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+    }
+    df = pd.DataFrame(data)
+
+    # Test the FeatureSpaceProximity class
+    features = ["Feature1", "Feature2", "Feature3"]
+    prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+    print(prox.neighbors(1, n_neighbors=2))
+
+    # Test the neighbors method with radius
+    print(prox.neighbors(1, radius=2.0))
+
+    # Test with Features list
+    prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+    print(prox.neighbors(1))
+
+    # Create a sample DataFrame
+    data = {
+        "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+
+    # Test with String Ids
+    prox = FeatureSpaceProximity(
+        df,
+        id_column="id",
+        features=["Feature1", "Feature2"],
+        target="target",
+        include_all_columns=True,
+    )
+    print(prox.neighbors(["a", "b"]))
+
+    # Test duplicate IDs
+    data = {
+        "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+        "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+        "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+        "target": [1, 0, 1, 0, 5],
+    }
+    df = pd.DataFrame(data)
+    prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+    print(df.equals(prox.df))
+
+    # Test on real data from Workbench
+    from workbench.api import FeatureSet, Model
+
+    fs = FeatureSet("aqsol_features")
+    model = Model("aqsol-regression")
+    features = model.features()
+    df = fs.pull_dataframe()
+    prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+    print("\n" + "=" * 80)
+    print("Testing Neighbors...")
+    print("=" * 80)
+    test_id = df[fs.id_column].tolist()[0]
+    print(f"\nNeighbors for ID {test_id}:")
+    print(prox.neighbors(test_id))
+
+    print("\n" + "=" * 80)
+    print("Testing isolated_compounds...")
+    print("=" * 80)
+
+    # Test isolated data in the top 1%
+    isolated_1pct = prox.isolated(top_percent=1.0)
+    print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+    print(isolated_1pct)
+
+    # Test isolated data in the top 5%
+    isolated_5pct = prox.isolated(top_percent=5.0)
+    print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+    print(isolated_5pct)
+
+    print("\n" + "=" * 80)
+    print("Testing target_gradients...")
+    print("=" * 80)
+
+    # Test with different parameters
+    gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+    print(f"\nTop 1% target gradients (min_delta=5.0) (n={len(gradients_1pct)}):")
+    print(gradients_1pct)
+
+    gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+    print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+    print(gradients_5pct)
+
+    # Test proximity_stats
+    print("\n" + "=" * 80)
+    print("Testing proximity_stats...")
+    print("=" * 80)
+    stats = prox.proximity_stats()
+    print(stats)
+
+    # Plot the distance distribution using pandas
+    print("\n" + "=" * 80)
+    print("Plotting distance distribution...")
+    print("=" * 80)
+    prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+
+    # Visualize the 2D projection
+    print("\n" + "=" * 80)
+    print("Visualizing 2D Projection...")
+    print("=" * 80)
+    from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
+    from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
+
+    unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
+    unit_test.run()
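At its core, the new FeatureSpaceProximity class standardizes the numeric features and queries a scikit-learn NearestNeighbors index, with the ID handling and neighbor/isolation queries inherited from the Proximity base class. A dependency-free sketch of that scale-then-query pattern:

    # Bare-bones version of the scale-then-query pattern used by FeatureSpaceProximity
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import NearestNeighbors

    df = pd.DataFrame(
        {
            "ID": [1, 2, 3, 4, 5],
            "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
            "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
        }
    )
    features = ["Feature1", "Feature2"]

    scaler = StandardScaler()
    X = scaler.fit_transform(df[features])
    nn = NearestNeighbors().fit(X)

    # Neighbors of the first row: itself plus the two closest points, by Euclidean distance
    distances, indices = nn.kneighbors(X[[0]], n_neighbors=3)
    print(df.iloc[indices[0]].assign(distance=distances[0]))
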
@@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

 from io import StringIO
@@ -33,7 +33,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     """
     Matches and renames DataFrame columns to match model feature names (case-insensitive).
     Prioritizes exact matches, then case-insensitive matches.
-    
+
     Raises ValueError if any model features cannot be matched.
     """
     df_columns_lower = {col.lower(): col for col in df.columns}
@@ -46,7 +46,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
             rename_dict[df_columns_lower[feature.lower()]] = feature
         else:
             missing.append(feature)
-    
+
     if missing:
         raise ValueError(f"Features not found: {missing}")

@@ -76,10 +76,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])

     # Check if the DataFrame is empty
@@ -112,10 +109,7 @@ if __name__ == "__main__":
     )

     # Create a Pipeline with StandardScaler
-    model = Pipeline([
-        ("scaler", StandardScaler()),
-        ("model", model)
-    ])
+    model = Pipeline([("scaler", StandardScaler()), ("model", model)])

     # Prepare features and targets for training
     X_train = df_train[features]
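The collapsed Pipeline line above chains feature standardization with whatever estimator was constructed just before it, so downstream code never touches the scaler directly. A generic sketch of the same pattern (Ridge is a stand-in estimator, not necessarily the one this template trains):

    # Generic sketch of the scaler + model Pipeline pattern (Ridge is a placeholder estimator)
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import Ridge

    model = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
    # model.fit(X_train, y_train) scales then fits; model.predict(X_new) applies the same scaling
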
@@ -3,11 +3,8 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
+from scipy.stats import spearmanr

 from io import StringIO
 import json
@@ -21,7 +18,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }


@@ -87,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -136,11 +130,16 @@ if __name__ == "__main__":
     # Calculate various model performance metrics (regression)
     rmse = root_mean_squared_error(y_validate, preds)
     mae = mean_absolute_error(y_validate, preds)
+    medae = median_absolute_error(y_validate, preds)
     r2 = r2_score(y_validate, preds)
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
-    print(f"NumRows: {len(df_val)}")
+    spearman_corr = spearmanr(y_validate, preds).correlation
+    support = len(df_val)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")

     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))
@@ -212,16 +211,29 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
-    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Return the modified DataFrame
     return df
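The widened quantile set comes straight from the predicted distribution: for NGBoost's default Normal output, ppf(q) is just loc plus scale times the standard normal quantile, so q_05/q_95 bound a 90% interval and q_10/q_90 an 80% interval. A small sketch of the equivalent computation with scipy, assuming a Normal predictive distribution:

    # Equivalent interval math for a Normal predictive distribution (NGBoost's default)
    import pandas as pd
    from scipy.stats import norm

    # Pretend these came from y_dists.params for three predictions
    loc = pd.Series([1.0, 2.0, 3.0])    # predicted mean -> "prediction"
    scale = pd.Series([0.5, 0.4, 0.6])  # predicted std  -> "prediction_std"

    df = pd.DataFrame({"prediction": loc, "prediction_std": scale})
    for q, col in [(0.025, "q_025"), (0.05, "q_05"), (0.10, "q_10"), (0.25, "q_25"),
                   (0.75, "q_75"), (0.90, "q_90"), (0.95, "q_95"), (0.975, "q_975")]:
        df[col] = norm.ppf(q, loc=loc, scale=scale)

    print(df)  # q_05/q_95 bracket each row's 90% interval, q_10/q_90 the 80% interval
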
@@ -1,3 +1 @@
-# Note: NGBoost is not included in the default inference image, so it must be specified here.
-ngboost
-mapie
+# Note: Most libs are already in the training/inference images, ONLY specify additional libs here
@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }

 # Imports for XGB Model
@@ -12,11 +12,8 @@ import awswrangler as wr
 import numpy as np

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, root_mean_squared_error
+from scipy.stats import spearmanr

 from io import StringIO
 import json
@@ -39,6 +36,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)

+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +93,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +144,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()

-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -161,11 +154,16 @@ if __name__ == "__main__":
     # Report Performance Metrics
     rmse = root_mean_squared_error(result_df[target], result_df["prediction"])
     mae = mean_absolute_error(result_df[target], result_df["prediction"])
+    medae = median_absolute_error(result_df[target], result_df["prediction"])
     r2 = r2_score(result_df[target], result_df["prediction"])
-    print(f"RMSE: {rmse:.3f}")
-    print(f"MAE: {mae:.3f}")
-    print(f"R2: {r2:.3f}")
-    print(f"NumRows: {len(result_df)}")
+    spearman_corr = spearmanr(result_df[target], result_df["prediction"]).correlation
+    support = len(result_df)
+    print(f"rmse: {rmse:.3f}")
+    print(f"mae: {mae:.3f}")
+    print(f"medae: {medae:.3f}")
+    print(f"r2: {r2:.3f}")
+    print(f"spearmanr: {spearman_corr:.3f}")
+    print(f"support: {support}")

     # Now save the models
     for name, model in models.items():
@@ -210,7 +208,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-    
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
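For reference, input_fn handlers like the one touched above typically decode the request body and parse it as CSV into a DataFrame. A hedged sketch of that overall shape (only the empty-input check and bytes decoding appear in the diff; the CSV branch below is an assumption):

    # Sketch of a CSV input_fn handler; details beyond the decoding shown above are assumptions
    from io import StringIO
    import pandas as pd

    def input_fn(input_data, content_type):
        """Parse input data and return a DataFrame."""
        if not input_data:
            raise ValueError("Empty input data is not supported!")

        # Decode bytes to string if necessary
        if isinstance(input_data, bytes):
            input_data = input_data.decode("utf-8")

        if "text/csv" in content_type:
            return pd.read_csv(StringIO(input_data))
        raise ValueError(f"{content_type} not supported!")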