workbench 0.8.177__py3-none-any.whl → 0.8.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic.

Files changed (140)
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +1 -2
  3. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  4. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  5. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  6. workbench/algorithms/dataframe/projection_2d.py +44 -21
  7. workbench/algorithms/dataframe/proximity.py +259 -305
  8. workbench/algorithms/graph/light/proximity_graph.py +12 -11
  9. workbench/algorithms/models/cleanlab_model.py +382 -0
  10. workbench/algorithms/models/noise_model.py +388 -0
  11. workbench/algorithms/sql/column_stats.py +0 -1
  12. workbench/algorithms/sql/correlations.py +0 -1
  13. workbench/algorithms/sql/descriptive_stats.py +0 -1
  14. workbench/algorithms/sql/outliers.py +3 -3
  15. workbench/api/__init__.py +5 -1
  16. workbench/api/df_store.py +17 -108
  17. workbench/api/endpoint.py +14 -12
  18. workbench/api/feature_set.py +117 -11
  19. workbench/api/meta.py +0 -1
  20. workbench/api/meta_model.py +289 -0
  21. workbench/api/model.py +52 -21
  22. workbench/api/parameter_store.py +3 -52
  23. workbench/cached/cached_meta.py +0 -1
  24. workbench/cached/cached_model.py +49 -11
  25. workbench/core/artifacts/__init__.py +11 -2
  26. workbench/core/artifacts/artifact.py +5 -5
  27. workbench/core/artifacts/df_store_core.py +114 -0
  28. workbench/core/artifacts/endpoint_core.py +319 -204
  29. workbench/core/artifacts/feature_set_core.py +249 -45
  30. workbench/core/artifacts/model_core.py +135 -82
  31. workbench/core/artifacts/parameter_store_core.py +98 -0
  32. workbench/core/cloud_platform/cloud_meta.py +0 -1
  33. workbench/core/pipelines/pipeline_executor.py +1 -1
  34. workbench/core/transforms/features_to_model/features_to_model.py +60 -44
  35. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
  36. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  37. workbench/core/views/training_view.py +113 -42
  38. workbench/core/views/view.py +53 -3
  39. workbench/core/views/view_utils.py +4 -4
  40. workbench/model_script_utils/model_script_utils.py +339 -0
  41. workbench/model_script_utils/pytorch_utils.py +405 -0
  42. workbench/model_script_utils/uq_harness.py +277 -0
  43. workbench/model_scripts/chemprop/chemprop.template +774 -0
  44. workbench/model_scripts/chemprop/generated_model_script.py +774 -0
  45. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  46. workbench/model_scripts/chemprop/requirements.txt +3 -0
  47. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  48. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +0 -1
  49. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  50. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
  51. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  52. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  53. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  54. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  55. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  56. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  57. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
  58. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  59. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  60. workbench/model_scripts/meta_model/meta_model.template +209 -0
  61. workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
  62. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  63. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  64. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  65. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  66. workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
  67. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  68. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  69. workbench/model_scripts/script_generation.py +15 -12
  70. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  71. workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
  72. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  73. workbench/model_scripts/xgb_model/uq_harness.py +277 -0
  74. workbench/model_scripts/xgb_model/xgb_model.template +367 -399
  75. workbench/repl/workbench_shell.py +18 -14
  76. workbench/resources/open_source_api.key +1 -1
  77. workbench/scripts/endpoint_test.py +162 -0
  78. workbench/scripts/lambda_test.py +73 -0
  79. workbench/scripts/meta_model_sim.py +35 -0
  80. workbench/scripts/ml_pipeline_sqs.py +122 -6
  81. workbench/scripts/training_test.py +85 -0
  82. workbench/themes/dark/custom.css +59 -0
  83. workbench/themes/dark/plotly.json +5 -5
  84. workbench/themes/light/custom.css +153 -40
  85. workbench/themes/light/plotly.json +9 -9
  86. workbench/themes/midnight_blue/custom.css +59 -0
  87. workbench/utils/aws_utils.py +0 -1
  88. workbench/utils/chem_utils/fingerprints.py +87 -46
  89. workbench/utils/chem_utils/mol_descriptors.py +0 -1
  90. workbench/utils/chem_utils/projections.py +16 -6
  91. workbench/utils/chem_utils/vis.py +25 -27
  92. workbench/utils/chemprop_utils.py +141 -0
  93. workbench/utils/config_manager.py +2 -6
  94. workbench/utils/endpoint_utils.py +5 -7
  95. workbench/utils/license_manager.py +2 -6
  96. workbench/utils/markdown_utils.py +57 -0
  97. workbench/utils/meta_model_simulator.py +499 -0
  98. workbench/utils/metrics_utils.py +256 -0
  99. workbench/utils/model_utils.py +260 -76
  100. workbench/utils/pipeline_utils.py +0 -1
  101. workbench/utils/plot_utils.py +159 -34
  102. workbench/utils/pytorch_utils.py +87 -0
  103. workbench/utils/shap_utils.py +11 -57
  104. workbench/utils/theme_manager.py +95 -30
  105. workbench/utils/xgboost_local_crossfold.py +267 -0
  106. workbench/utils/xgboost_model_utils.py +127 -220
  107. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  108. workbench/web_interface/components/model_plot.py +16 -2
  109. workbench/web_interface/components/plugin_unit_test.py +5 -3
  110. workbench/web_interface/components/plugins/ag_table.py +2 -4
  111. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  112. workbench/web_interface/components/plugins/model_details.py +48 -80
  113. workbench/web_interface/components/plugins/scatter_plot.py +192 -92
  114. workbench/web_interface/components/settings_menu.py +184 -0
  115. workbench/web_interface/page_views/main_page.py +0 -1
  116. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
  117. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/RECORD +121 -106
  118. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
  119. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
  120. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  121. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  122. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  123. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  124. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  125. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -494
  126. workbench/model_scripts/custom_models/uq_models/mapie.template +0 -494
  127. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
  128. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  129. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  130. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  131. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  132. workbench/themes/quartz/base_css.url +0 -1
  133. workbench/themes/quartz/custom.css +0 -117
  134. workbench/themes/quartz/plotly.json +0 -642
  135. workbench/themes/quartz_dark/base_css.url +0 -1
  136. workbench/themes/quartz_dark/custom.css +0 -131
  137. workbench/themes/quartz_dark/plotly.json +0 -642
  138. workbench/utils/resource_utils.py +0 -39
  139. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
  140. {workbench-0.8.177.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
workbench/algorithms/models/noise_model.py ADDED
@@ -0,0 +1,388 @@
+ import pandas as pd
+ import numpy as np
+ from xgboost import XGBRegressor
+ from typing import List
+ import logging
+
+ from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity
+
+ # Set up logging
+ log = logging.getLogger("workbench")
+
+
+ class NoiseModel:
+     """Composite noise detection for regression data using multiple complementary signals.
+
+     The NoiseModel identifies potentially noisy or problematic samples in regression datasets
+     by combining three independent signals:
+
+     1. **Underfit Model Residuals**: A deliberately simple XGBoost model (low depth, few trees)
+        that captures only the main trends. High residuals indicate samples in complex regions
+        or unusual areas of the feature space.
+
+     2. **Overfit Model Residuals**: A deliberately complex XGBoost model (deep trees, many
+        iterations, no regularization) that attempts to memorize the training data. High residuals
+        here indicate samples the model *cannot* fit even when trying to memorize - a strong
+        signal of label noise. This is the "training error" approach validated in:
+        "Denoising Drug Discovery Data for Improved ADMET Property Prediction" (Merck, JCIM 2024)
+
+     3. **High Target Gradient (HTG)**: Using the Proximity class, measures disagreement between
+        a sample's target value and its neighbors in feature space. High gradients indicate
+        activity cliffs or potential measurement errors where similar compounds have very
+        different target values.
+
+     The combined noise score weights the overfit residual signal more heavily (2x) based on
+     the paper's finding that training error is the most reliable noise detector for regression.
+
+     Example:
+         ```python
+         from workbench.algorithms.models.noise_model import NoiseModel
+
+         # Create noise model
+         noise_model = NoiseModel(df, id_column="id", features=feature_list, target="target")
+
+         # Get noise scores for all samples
+         scores_df = noise_model.get_scores()
+
+         # Get sample weights for training (lower weight for noisy samples)
+         weights = noise_model.get_sample_weights(strategy="inverse")
+
+         # Get clean subset (bottom 90% by noise score)
+         clean_df = noise_model.get_clean_subset(percentile=90)
+
+         # Find samples with same features but different targets (definite noise)
+         conflicts = noise_model.coincident_conflicts()
+         ```
+
+     References:
+         Adrian, M., Chung, Y., & Cheng, A. C. (2024). Denoising Drug Discovery Data for
+         Improved ADMET Property Prediction. J. Chem. Inf. Model., 64(16), 6324-6337.
+     """
+
+     def __init__(
+         self,
+         df: pd.DataFrame,
+         id_column: str,
+         features: List[str],
+         target: str,
+     ):
+         """
+         Initialize the NoiseModel class.
+
+         Args:
+             df: DataFrame containing data for noise detection.
+             id_column: Name of the column used as the identifier.
+             features: List of feature column names.
+             target: Name of the target column.
+         """
+         self.id_column = id_column
+         self.target = target
+
+         # Filter out non-numeric features
+         self.features = self._validate_features(df, features)
+
+         # Drop NaN rows in features and target
+         self.df = df.dropna(subset=self.features + [self.target]).copy()
+
+         # Compute target stats for normalization
+         self.target_std = self.df[self.target].std()
+         self.target_range = self.df[self.target].max() - self.df[self.target].min()
+
+         # Build all component models
+         self._build_models()
+
+         # Precompute all noise signals
+         self._precompute_signals()
+
+     def get_scores(self) -> pd.DataFrame:
+         """
+         Get noise scores for all samples.
+
+         Returns:
+             DataFrame with id, individual signal columns, and combined noise_score
+         """
+         result = self.df[[self.id_column, self.target]].copy()
+         result["underfit_residual"] = self.df["underfit_residual"]
+         result["overfit_residual"] = self.df["overfit_residual"]
+         result["htg_score"] = self.df["htg_score"]
+         result["noise_score"] = self.df["noise_score"]
+         return result.sort_values("noise_score", ascending=False).reset_index(drop=True)
+
+     def get_sample_weights(self, strategy: str = "inverse") -> pd.Series:
+         """
+         Get sample weights for training, indexed by id_column.
+
+         Args:
+             strategy: Weighting strategy
+                 - "inverse": 1 / (1 + noise_score)
+                 - "soft": 1 - noise_score (clipped to [0.1, 1.0])
+                 - "threshold": 1.0 if noise_score < median, else 0.5
+
+         Returns:
+             Series of weights indexed by id_column
+         """
+         scores = self.df.set_index(self.id_column)["noise_score"]
+
+         if strategy == "inverse":
+             weights = 1.0 / (1.0 + scores)
+         elif strategy == "soft":
+             weights = (1.0 - scores).clip(lower=0.1, upper=1.0)
+         elif strategy == "threshold":
+             median_score = scores.median()
+             weights = (scores < median_score).apply(lambda x: 1.0 if x else 0.5)
+         else:
+             raise ValueError(f"Unknown strategy: {strategy}")
+
+         return weights
+
+     def get_clean_subset(self, percentile: float = 90.0) -> pd.DataFrame:
+         """
+         Get a subset of data with lowest noise scores.
+
+         Args:
+             percentile: Keep samples below this percentile of noise score (default: 90 = bottom 90%)
+
+         Returns:
+             DataFrame of "clean" samples
+         """
+         threshold = np.percentile(self.df["noise_score"], percentile)
+         return self.df[self.df["noise_score"] <= threshold].copy()
+
+     def get_noisy_samples(self, top_percent: float = 10.0) -> pd.DataFrame:
+         """
+         Get samples with highest noise scores.
+
+         Args:
+             top_percent: Percentage of noisiest samples to return (default: 10%)
+
+         Returns:
+             DataFrame of noisy samples, sorted by noise_score descending
+         """
+         percentile = 100 - top_percent
+         threshold = np.percentile(self.df["noise_score"], percentile)
+         noisy = self.df[self.df["noise_score"] >= threshold].copy()
+         return noisy.sort_values("noise_score", ascending=False).reset_index(drop=True)
+
+     def coincident_conflicts(self, distance_threshold: float = 1e-5) -> pd.DataFrame:
+         """
+         Find samples that map to the same point in feature space but have different targets.
+
+         These are definitive noise - same features, different target values.
+
+         Args:
+             distance_threshold: Maximum distance to consider "coincident" (default: 1e-5)
+
+         Returns:
+             DataFrame of coincident conflicts with their target differences
+         """
+         # Use proximity to find coincident points
+         coincident = self.df[self.df["nn_distance"] < distance_threshold].copy()
+
+         if len(coincident) == 0:
+             return pd.DataFrame(columns=[self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff"])
+
+         return (
+             coincident[[self.id_column, self.target, "nn_id", "nn_target", "nn_target_diff", "noise_score"]]
+             .sort_values("nn_target_diff", ascending=False)
+             .reset_index(drop=True)
+         )
+
+     def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+         """Remove non-numeric features and log warnings."""
+         non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+         if non_numeric:
+             log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+         return [f for f in features if f not in non_numeric]
+
+     def _build_models(self) -> None:
+         """Build the underfit, overfit, and proximity models."""
+         log.info("Building noise detection models...")
+
+         X = self.df[self.features]
+         y = self.df[self.target]
+
+         # Underfit model: intentionally simple (high bias)
+         log.info(" Fitting underfit model...")
+         self.underfit_model = XGBRegressor(
+             max_depth=2,
+             n_estimators=20,
+             learning_rate=0.1,
+             random_state=42,
+             verbosity=0,
+         )
+         self.underfit_model.fit(X, y)
+
+         # Overfit model: intentionally complex (high variance, low regularization)
+         log.info(" Fitting overfit model...")
+         self.overfit_model = XGBRegressor(
+             max_depth=12,
+             n_estimators=500,
+             learning_rate=0.1,
+             reg_lambda=0.0,
+             reg_alpha=0.0,
+             min_child_weight=1,
+             random_state=42,
+             verbosity=0,
+         )
+         self.overfit_model.fit(X, y)
+
+         # Proximity model for feature space analysis
+         log.info(" Building proximity model...")
+         self.proximity = FeatureSpaceProximity(
+             self.df,
+             id_column=self.id_column,
+             features=self.features,
+             target=self.target,
+         )
+
+         # Copy proximity metrics to our df
+         self.df["nn_distance"] = self.proximity.df["nn_distance"].values
+         self.df["nn_id"] = self.proximity.df["nn_id"].values
+         self.df["nn_target"] = self.proximity.df["nn_target"].values
+         self.df["nn_target_diff"] = self.proximity.df["nn_target_diff"].values
+
+         log.info("Noise detection models built successfully")
+
+     def _precompute_signals(self) -> None:
+         """Precompute all noise signals for every sample."""
+         log.info("Precomputing noise signals...")
+
+         X = self.df[self.features]
+         y = self.df[self.target].values
+
+         # Underfit residuals (normalized by target std)
+         underfit_pred = self.underfit_model.predict(X)
+         self.df["underfit_residual"] = np.abs(y - underfit_pred) / self.target_std
+
+         # Overfit residuals (normalized by target std)
+         # This is the key "training error" signal from the paper
+         overfit_pred = self.overfit_model.predict(X)
+         self.df["overfit_residual"] = np.abs(y - overfit_pred) / self.target_std
+
+         # HTG score: neighbor disagreement (normalized by target std)
+         # Using nn_target_diff directly, normalized
+         self.df["htg_score"] = self.df["nn_target_diff"] / self.target_std
+
+         # Combine into overall noise score
+         # Scale each component to [0, 1] using percentile ranks, then average
+         self.df["noise_score"] = self._compute_combined_score()
+
+         log.info("Noise signals precomputed successfully")
+
+     def _compute_combined_score(self) -> np.ndarray:
+         """
+         Combine individual signals into a single noise score.
+
+         Uses percentile ranks to normalize each signal to [0, 1], then averages.
+         Overfit residual gets higher weight as it's the most validated signal (per the paper).
+         """
+         # Convert to percentile ranks (0-1 scale)
+         overfit_rank = self.df["overfit_residual"].rank(pct=True)
+         htg_rank = self.df["htg_score"].rank(pct=True)
+
+         # Weighted average: overfit gets 2x weight based on paper's findings
+         # that training error is the best noise detector
+         combined = (2.0 * overfit_rank + 1.0 * htg_rank) / 3.0
+
+         return combined.values
+
+
+ # Testing the NoiseModel class
+ if __name__ == "__main__":
+
+     from workbench.api import FeatureSet, Model
+
+     pd.set_option("display.max_columns", None)
+     pd.set_option("display.width", 1000)
+
+     # Create a sample DataFrame with some noisy points
+     np.random.seed(42)
+     n_samples = 100
+
+     # Generate clean data: y = 2*x1 + 3*x2 + noise
+     x1 = np.random.randn(n_samples)
+     x2 = np.random.randn(n_samples)
+     y_clean = 2 * x1 + 3 * x2 + np.random.randn(n_samples) * 0.1
+
+     # Add some noisy points (last 10 samples)
+     y_noisy = y_clean.copy()
+     y_noisy[-10:] += np.random.randn(10) * 5  # Large noise
+
+     data = {
+         "ID": [f"sample_{i}" for i in range(n_samples)],
+         "Feature1": x1,
+         "Feature2": x2,
+         "target": y_noisy,
+     }
+     df = pd.DataFrame(data)
+
+     print("=" * 80)
+     print("Testing NoiseModel...")
+     print("=" * 80)
+
+     # Create noise model
+     noise_model = NoiseModel(
+         df,
+         id_column="ID",
+         features=["Feature1", "Feature2"],
+         target="target",
+     )
+
+     # Get noise scores
+     print("\nTop 10 noisiest samples:")
+     scores = noise_model.get_scores()
+     print(scores.head(10))
+
+     # Check if our artificially noisy samples are detected
+     noisy_ids = [f"sample_{i}" for i in range(90, 100)]
+     detected = scores[scores["ID"].isin(noisy_ids)]
+     median_score = scores["noise_score"].median()
+     print(f"\nOf 10 noisy samples, {len(detected[detected['noise_score'] > median_score])} above median noise score")
+
+     # Get sample weights
+     print("\nSample weights (inverse strategy):")
+     weights = noise_model.get_sample_weights(strategy="inverse")
+     print(f" Min weight: {weights.min():.3f}")
+     print(f" Max weight: {weights.max():.3f}")
+     print(f" Mean weight: {weights.mean():.3f}")
+
+     # Get clean subset
+     clean = noise_model.get_clean_subset(percentile=90)
+     print(f"\nClean subset (bottom 90%): {len(clean)} samples")
+
+     # Get noisy samples
+     noisy = noise_model.get_noisy_samples(top_percent=10)
+     print(f"\nNoisy samples (top 10%): {len(noisy)} samples")
+     print(noisy[["ID", "target", "overfit_residual", "htg_score", "noise_score"]].head())
+
+     # Test with real data
+     print("\n" + "=" * 80)
+     print("Testing with AQSol data...")
+     print("=" * 80)
+     fs = FeatureSet("aqsol_features")
+     model = Model("aqsol-regression")
+
+     if fs.exists():
+         features = model.features()
+         target = model.target()
+         df = fs.pull_dataframe()
+
+         noise_model = NoiseModel(
+             df,
+             id_column=fs.id_column,
+             features=features,
+             target=target,
+         )
+
+         print("\nTop 10 noisiest compounds:")
+         scores = noise_model.get_scores()
+         print(scores.head(10))
+
+         print("\nCoincident conflicts:")
+         conflicts = noise_model.coincident_conflicts()
+         print(f"Found {len(conflicts)} coincident conflicts")
+         if len(conflicts) > 0:
+             print(conflicts.head())
+
+         print("\nNoise score distribution:")
+         print(scores["noise_score"].describe())
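A minimal sketch of how the NoiseModel shown above might be wired into a training run: `get_sample_weights()` returns a Series indexed by the id column, so it has to be re-aligned to the training rows before being handed to XGBoost's `sample_weight`. Only the NoiseModel methods come from the diff; the synthetic data, column names, and XGBoost settings below are illustrative assumptions.

```python
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

from workbench.algorithms.models.noise_model import NoiseModel

# Synthetic regression data with a handful of corrupted labels (illustrative only)
rng = np.random.default_rng(42)
n = 200
df = pd.DataFrame({
    "id": [f"sample_{i}" for i in range(n)],
    "x1": rng.normal(size=n),
    "x2": rng.normal(size=n),
})
df["target"] = 2 * df["x1"] + 3 * df["x2"] + rng.normal(scale=0.1, size=n)
df.loc[df.index[-15:], "target"] += rng.normal(scale=5.0, size=15)  # inject label noise

# Score every row with the composite noise model from the diff above
noise_model = NoiseModel(df, id_column="id", features=["x1", "x2"], target="target")

# Option 1: down-weight noisy rows during training
weights = noise_model.get_sample_weights(strategy="inverse")  # Series indexed by "id"
row_weights = df["id"].map(weights).fillna(0.0)               # align weights to the training rows

model = XGBRegressor(n_estimators=200, max_depth=4, random_state=42)
model.fit(df[["x1", "x2"]], df["target"], sample_weight=row_weights)

# Option 2: drop the noisiest ~10% entirely and train on the "clean" subset
clean_df = noise_model.get_clean_subset(percentile=90)
model_clean = XGBRegressor(n_estimators=200, max_depth=4, random_state=42)
model_clean.fit(clean_df[["x1", "x2"]], clean_df["target"])
```

Re-weighting keeps every row but shrinks the influence of suspect labels, while the clean-subset route discards them outright; which one is appropriate depends on how much data can be spared.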
workbench/algorithms/sql/column_stats.py CHANGED
@@ -6,7 +6,6 @@ import pandas as pd
  # Workbench Imports
  from workbench.core.artifacts.data_source_abstract import DataSourceAbstract
 
-
  # Workbench Logger
  log = logging.getLogger("workbench")
 
workbench/algorithms/sql/correlations.py CHANGED
@@ -7,7 +7,6 @@ from collections import defaultdict
  # Workbench Imports
  from workbench.core.artifacts.data_source_abstract import DataSourceAbstract
 
-
  # Workbench Logger
  log = logging.getLogger("workbench")
 
workbench/algorithms/sql/descriptive_stats.py CHANGED
@@ -7,7 +7,6 @@ from collections import defaultdict
  # Workbench Imports
  from workbench.core.artifacts.data_source_abstract import DataSourceAbstract
 
-
  # Workbench Logger
  log = logging.getLogger("workbench")
 
workbench/algorithms/sql/outliers.py CHANGED
@@ -209,9 +209,9 @@ class Outliers:
              else:
                  return group.nlargest(n, col)
 
-         # Group by 'outlier_group' and apply the helper function, explicitly selecting columns
-         top_outliers = outlier_df.groupby("outlier_group", group_keys=False).apply(
-             get_extreme_values, include_groups=True
+         # Group by 'outlier_group' and apply the helper function, explicitly selecting columns to silence warning
+         top_outliers = outlier_df.groupby("outlier_group", group_keys=False)[outlier_df.columns].apply(
+             get_extreme_values
          )
          return top_outliers.reset_index(drop=True)
 
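For context on the outliers.py change above: recent pandas releases emit a DeprecationWarning when `GroupBy.apply` operates on the grouping columns, and selecting the columns explicitly (as the new code does) keeps the grouping column in each group without the warning. A self-contained sketch of the same pattern, with a made-up DataFrame and helper:

```python
import pandas as pd

df = pd.DataFrame({
    "outlier_group": ["a", "a", "b", "b", "b"],
    "value": [1.0, 9.0, 2.0, 7.0, 3.0],
})

def get_extreme_values(group: pd.DataFrame, n: int = 1) -> pd.DataFrame:
    # Keep the n largest rows of each group by 'value'
    return group.nlargest(n, "value")

# Selecting the columns explicitly keeps 'outlier_group' in the result
# without triggering the pandas warning about applying to grouping columns.
top = df.groupby("outlier_group", group_keys=False)[df.columns].apply(get_extreme_values)
print(top.reset_index(drop=True))
```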
workbench/api/__init__.py CHANGED
@@ -5,6 +5,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
  - DataSource: Manages AWS Data Catalog and Athena
  - FeatureSet: Manages AWS Feature Store and Feature Groups
  - Model: Manages the training and deployment of AWS Model Groups and Packages
+ - MetaModel: A Model that aggregates predictions from multiple child endpoints
  - ModelType: Enum for the different model types supported by Workbench
  - Endpoint: Manages the deployment and invocations/inference on AWS Endpoints
  - Meta: Provides an API to retrieve AWS Metadata for the above classes
@@ -14,7 +15,8 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 
  from .data_source import DataSource
  from .feature_set import FeatureSet
- from .model import Model, ModelType
+ from .model import Model, ModelType, ModelFramework
+ from .meta_model import MetaModel
  from .endpoint import Endpoint
  from .meta import Meta
  from .parameter_store import ParameterStore
@@ -24,7 +26,9 @@ __all__ = [
      "DataSource",
      "FeatureSet",
      "Model",
+     "MetaModel",
      "ModelType",
+     "ModelFramework",
      "Endpoint",
      "Meta",
      "ParameterStore",
workbench/api/df_store.py CHANGED
@@ -1,35 +1,32 @@
  """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
 
- from datetime import datetime
  from typing import Union
- import logging
- import pandas as pd
 
  # Workbench Imports
- from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore
+ from workbench.core.artifacts.df_store_core import DFStoreCore
 
 
- class DFStore(AWSDFStore):
+ class DFStore(DFStoreCore):
      """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
 
-     Common Usage:
-         ```python
-         df_store = DFStore()
+     Common Usage:
+         ```python
+         df_store = DFStore()
 
-         # List Data
-         df_store.list()
+         # List Data
+         df_store.list()
 
-         # Add DataFrame
-         df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
-         df_store.upsert("/test/my_data", df)
+         # Add DataFrame
+         df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+         df_store.upsert("/test/my_data", df)
 
-         # Retrieve DataFrame
-         df = df_store.get("/test/my_data")
-         print(df)
+         # Retrieve DataFrame
+         df = df_store.get("/test/my_data")
+         print(df)
 
-         # Delete Data
-         df_store.delete("/test/my_data")
-         ```
+         # Delete Data
+         df_store.delete("/test/my_data")
+         ```
      """
 
      def __init__(self, path_prefix: Union[str, None] = None):
@@ -38,101 +35,13 @@ class DFStore(AWSDFStore):
          Args:
              path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
          """
-         self.log = logging.getLogger("workbench")
-
-         # Initialize the SuperClass
          super().__init__(path_prefix=path_prefix)
 
-     def list(self, include_cache: bool = False) -> list:
-         """List all the objects in the data_store prefix.
-
-         Args:
-             include_cache (bool, optional): Include cache objects in the list (Defaults to False).
-
-         Returns:
-             list: A list of all the objects in the data_store prefix.
-         """
-         return super().list(include_cache=include_cache)
-
-     def summary(self, include_cache: bool = False) -> pd.DataFrame:
-         """Return a nicely formatted summary of object locations, sizes (in MB), and modified dates.
-
-         Args:
-             include_cache (bool, optional): Include cache objects in the summary (Defaults to False).
-
-         Returns:
-             pd.DataFrame: A formatted DataFrame with the summary details.
-         """
-         return super().summary(include_cache=include_cache)
-
-     def details(self, include_cache: bool = False) -> pd.DataFrame:
-         """Return a DataFrame with detailed metadata for all objects in the data_store prefix.
-
-         Args:
-             include_cache (bool, optional): Include cache objects in the details (Defaults to False).
-
-         Returns:
-             pd.DataFrame: A DataFrame with detailed metadata for all objects in the data_store prefix.
-         """
-         return super().details(include_cache=include_cache)
-
-     def check(self, location: str) -> bool:
-         """Check if a DataFrame exists at the specified location
-
-         Args:
-             location (str): The location of the data to check.
-
-         Returns:
-             bool: True if the data exists, False otherwise.
-         """
-         return super().check(location)
-
-     def get(self, location: str) -> Union[pd.DataFrame, None]:
-         """Retrieve a DataFrame from AWS S3.
-
-         Args:
-             location (str): The location of the data to retrieve.
-
-         Returns:
-             pd.DataFrame: The retrieved DataFrame or None if not found.
-         """
-         _df = super().get(location)
-         if _df is None:
-             self.log.error(f"Dataframe not found at location: {location}")
-         return _df
-
-     def upsert(self, location: str, data: Union[pd.DataFrame, pd.Series]):
-         """Insert or update a DataFrame or Series in the AWS S3.
-
-         Args:
-             location (str): The location of the data.
-             data (Union[pd.DataFrame, pd.Series]): The data to be stored.
-         """
-         super().upsert(location, data)
-
-     def last_modified(self, location: str) -> Union[datetime, None]:
-         """Get the last modified date of the DataFrame at the specified location.
-
-         Args:
-             location (str): The location of the data to check.
-
-         Returns:
-             Union[datetime, None]: The last modified date of the DataFrame or None if not found.
-         """
-         return super().last_modified(location)
-
-     def delete(self, location: str):
-         """Delete a DataFrame from the AWS S3.
-
-         Args:
-             location (str): The location of the data to delete.
-         """
-         super().delete(location)
-
 
  if __name__ == "__main__":
      """Exercise the DFStore Class"""
      import time
+     import pandas as pd
 
      # Create a DFStore manager
      df_store = DFStore()
workbench/api/endpoint.py CHANGED
@@ -44,16 +44,21 @@ class Endpoint(EndpointCore):
          """
          return super().inference(eval_df, capture_name, id_column, drop_error_rows)
 
-     def auto_inference(self, capture: bool = False) -> pd.DataFrame:
-         """Run inference on the Endpoint using the FeatureSet evaluation data
+     def auto_inference(self) -> pd.DataFrame:
+         """Run inference on the Endpoint using the test data from the model training view
 
-         Args:
-             capture (bool): Capture the inference results
+         Returns:
+             pd.DataFrame: The DataFrame with predictions
+         """
+         return super().auto_inference()
+
+     def full_inference(self) -> pd.DataFrame:
+         """Run inference on the Endpoint using the full data from the model training view
 
          Returns:
              pd.DataFrame: The DataFrame with predictions
          """
-         return super().auto_inference(capture)
+         return super().full_inference()
 
      def fast_inference(self, eval_df: pd.DataFrame, threads: int = 4) -> pd.DataFrame:
          """Run inference on the Endpoint using the provided DataFrame
@@ -70,16 +75,13 @@ class Endpoint(EndpointCore):
          """
          return super().fast_inference(eval_df, threads=threads)
 
-     def cross_fold_inference(self, nfolds: int = 5) -> dict:
-         """Run cross-fold inference (only works for XGBoost models)
-
-         Args:
-             nfolds (int): The number of folds to use for cross-validation (default: 5)
+     def cross_fold_inference(self) -> pd.DataFrame:
+         """Pull cross-fold inference from model associated with this Endpoint
 
          Returns:
-             dict: A dictionary with fold results
+             pd.DataFrame: A DataFrame with cross fold predictions
          """
-         return super().cross_fold_inference(nfolds)
+         return super().cross_fold_inference()
 
 
  if __name__ == "__main__":
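To make the revised Endpoint inference API concrete, a usage sketch against the new signatures (the endpoint name and the eval DataFrame handling are placeholders; only the method names, arguments, and return types come from the diff above):

```python
import pandas as pd

from workbench.api import Endpoint

# Placeholder endpoint name - substitute one that exists in your AWS account
end = Endpoint("my-regression-end")

# Inference on the model's held-out test split (the capture flag is gone in the new API)
test_preds = end.auto_inference()

# Inference over the full data from the model training view (new in this release)
full_preds = end.full_inference()

# Cross-fold predictions are now pulled from the associated model rather than recomputed here
cv_preds: pd.DataFrame = end.cross_fold_inference()

# Threaded ad-hoc inference on a caller-supplied DataFrame
# (here we simply reuse the full-inference rows, dropping any prediction column)
eval_df = full_preds.drop(columns=["prediction"], errors="ignore")
fast_preds = end.fast_inference(eval_df, threads=4)
```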