workbench-0.8.213-py3-none-any.whl → workbench-0.8.217-py3-none-any.whl

This diff shows the content changes between two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (50)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/api/__init__.py +3 -0
  9. workbench/api/endpoint.py +10 -5
  10. workbench/api/feature_set.py +76 -6
  11. workbench/api/meta_model.py +289 -0
  12. workbench/api/model.py +43 -4
  13. workbench/core/artifacts/endpoint_core.py +63 -115
  14. workbench/core/artifacts/feature_set_core.py +1 -1
  15. workbench/core/artifacts/model_core.py +6 -4
  16. workbench/core/pipelines/pipeline_executor.py +1 -1
  17. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  18. workbench/model_script_utils/pytorch_utils.py +11 -1
  19. workbench/model_scripts/chemprop/chemprop.template +145 -69
  20. workbench/model_scripts/chemprop/generated_model_script.py +147 -71
  21. workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
  22. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  23. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  24. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  25. workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
  26. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  27. workbench/model_scripts/meta_model/meta_model.template +209 -0
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
  29. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  30. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  31. workbench/model_scripts/script_generation.py +4 -0
  32. workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
  33. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  34. workbench/repl/workbench_shell.py +0 -5
  35. workbench/scripts/endpoint_test.py +2 -2
  36. workbench/utils/chem_utils/fingerprints.py +7 -3
  37. workbench/utils/chemprop_utils.py +23 -5
  38. workbench/utils/meta_model_simulator.py +471 -0
  39. workbench/utils/metrics_utils.py +94 -10
  40. workbench/utils/model_utils.py +91 -9
  41. workbench/utils/pytorch_utils.py +1 -1
  42. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  43. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
  44. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
  45. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  46. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  47. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
  48. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
  49. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
  50. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0
workbench/model_scripts/pytorch_model/generated_model_script.py (identical changes apply to pytorch.template)

@@ -5,51 +5,36 @@
 # - Out-of-fold predictions for validation metrics
 # - Categorical feature embedding via TabularMLP
 # - Compressed feature decompression
+#
+# NOTE: Imports are structured to minimize serverless endpoint startup time.
+# Heavy imports (sklearn, awswrangler) are deferred to training time.
 
-import argparse
 import json
 import os
 
-import awswrangler as wr
 import joblib
 import numpy as np
 import pandas as pd
 import torch
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-from sklearn.preprocessing import LabelEncoder
-
-# Enable Tensor Core optimization for GPUs that support it
-torch.set_float32_matmul_precision("medium")
 
 from model_script_utils import (
-    check_dataframe,
-    compute_classification_metrics,
-    compute_regression_metrics,
     convert_categorical_types,
     decompress_features,
     expand_proba_column,
     input_fn,
     match_features_case_insensitive,
     output_fn,
-    print_classification_metrics,
-    print_confusion_matrix,
-    print_regression_metrics,
 )
 from pytorch_utils import (
     FeatureScaler,
-    create_model,
     load_model,
     predict,
     prepare_data,
-    save_model,
-    train_model,
 )
 from uq_harness import (
     compute_confidence,
     load_uq_models,
     predict_intervals,
-    save_uq_models,
-    train_uq_models,
 )
 
 # =============================================================================
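The restructuring above is the classic lazy-import pattern for serverless SageMaker endpoints: the container imports the script at cold start, so anything imported at module level delays the first request. A minimal standalone sketch of the idea (illustrative only, not Workbench code; sklearn stands in for any slow import):

# lazy_imports_sketch.py -- illustrative only.
# Module level: only what the inference path needs, so importing this
# script (and therefore endpoint cold start) stays fast.
import json


def predict_fn(payload: dict) -> dict:
    # Inference path: no sklearn/awswrangler touched anywhere above.
    return {"prediction": sum(payload.get("features", []))}


if __name__ == "__main__":
    # Training path: heavy imports are paid only when the script is *run*
    # as a training job, never when it is merely imported for serving.
    import argparse

    from sklearn.model_selection import train_test_split

    parser = argparse.ArgumentParser()
    parser.add_argument("--test-size", type=float, default=0.2)
    args = parser.parse_args()

    train, val = train_test_split(list(range(10)), test_size=args.test_size, random_state=42)
    print(f"train={len(train)} val={len(val)}")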
@@ -59,13 +44,15 @@ DEFAULT_HYPERPARAMETERS = {
     # Training parameters
     "n_folds": 5,
     "max_epochs": 200,
-    "early_stopping_patience": 20,
+    "early_stopping_patience": 30,
     "batch_size": 128,
-    # Model architecture
-    "layers": "256-128-64",
+    # Model architecture (larger capacity - ensemble provides regularization)
+    "layers": "512-256-128",
     "learning_rate": 1e-3,
-    "dropout": 0.1,
+    "dropout": 0.05,
     "use_batch_norm": True,
+    # Loss function for regression (L1Loss=MAE, MSELoss=MSE, HuberLoss, SmoothL1Loss)
+    "loss": "L1Loss",
     # Random seed
     "seed": 42,
 }
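For anyone consuming these defaults, the usual pattern is a shallow merge of user-supplied hyperparameters over DEFAULT_HYPERPARAMETERS. The merge below is a hypothetical sketch of that behavior, not the Workbench API; only the default values are taken from the hunk above:

# Defaults mirror the dict in the hunk above (0.8.217 values).
DEFAULT_HYPERPARAMETERS = {
    "n_folds": 5,
    "max_epochs": 200,
    "early_stopping_patience": 30,
    "batch_size": 128,
    "layers": "512-256-128",
    "learning_rate": 1e-3,
    "dropout": 0.05,
    "use_batch_norm": True,
    "loss": "L1Loss",  # new in 0.8.217: L1Loss | MSELoss | HuberLoss | SmoothL1Loss
    "seed": 42,
}

# Hypothetical per-model overrides merged over the defaults.
user_params = {"loss": "HuberLoss", "dropout": 0.1}
hyperparameters = {**DEFAULT_HYPERPARAMETERS, **user_params}
assert hyperparameters["loss"] == "HuberLoss" and hyperparameters["n_folds"] == 5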
@@ -86,7 +73,7 @@ TEMPLATE_PARAMS = {
 # Model Loading (for SageMaker inference)
 # =============================================================================
 def model_fn(model_dir: str) -> dict:
-    """Load TabularMLP ensemble from the specified directory."""
+    """Load PyTorch TabularMLP ensemble from the specified directory."""
     # Load ensemble metadata
     metadata_path = os.path.join(model_dir, "ensemble_metadata.joblib")
     if os.path.exists(metadata_path):
@@ -129,7 +116,7 @@ def model_fn(model_dir: str) -> dict:
 # Inference (for SageMaker inference)
 # =============================================================================
 def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
-    """Make predictions with TabularMLP ensemble."""
+    """Make predictions with PyTorch TabularMLP ensemble."""
     model_type = TEMPLATE_PARAMS["model_type"]
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
@@ -233,6 +220,36 @@ def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
 # Training
 # =============================================================================
 if __name__ == "__main__":
+    # -------------------------------------------------------------------------
+    # Training-only imports (deferred to reduce serverless startup time)
+    # -------------------------------------------------------------------------
+    import argparse
+
+    import awswrangler as wr
+    from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+    from sklearn.preprocessing import LabelEncoder
+
+    # Enable Tensor Core optimization for GPUs that support it
+    torch.set_float32_matmul_precision("medium")
+
+    from model_script_utils import (
+        check_dataframe,
+        compute_classification_metrics,
+        compute_regression_metrics,
+        print_classification_metrics,
+        print_confusion_matrix,
+        print_regression_metrics,
+    )
+    from pytorch_utils import (
+        create_model,
+        save_model,
+        train_model,
+    )
+    from uq_harness import (
+        save_uq_models,
+        train_uq_models,
+    )
+
     # -------------------------------------------------------------------------
     # Setup: Parse arguments and load data
     # -------------------------------------------------------------------------
@@ -377,6 +394,7 @@ if __name__ == "__main__":
             patience=hyperparameters["early_stopping_patience"],
             batch_size=hyperparameters["batch_size"],
             learning_rate=hyperparameters["learning_rate"],
+            loss=hyperparameters.get("loss", "L1Loss"),
             device=device,
         )
         ensemble_models.append(model)
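For orientation, model_fn and predict_fn above (together with the input_fn/output_fn imported from model_script_utils) follow the standard SageMaker inference-script contract: the serving container calls model_fn once at startup, then input_fn → predict_fn → output_fn for each request. A bare-bones sketch of that contract (illustrative, not the Workbench implementation):

import json
import os


def model_fn(model_dir: str) -> dict:
    # Called once at container startup; model_dir holds the unpacked
    # model artifact (typically /opt/ml/model).
    return {"artifact": os.path.join(model_dir, "model.joblib")}


def input_fn(request_body: str, content_type: str) -> dict:
    # Deserialize the request payload into whatever predict_fn expects.
    assert content_type == "application/json"
    return json.loads(request_body)


def predict_fn(data: dict, model: dict) -> dict:
    # Run inference with the object model_fn returned.
    return {"prediction": 0.0, "served_from": model["artifact"]}


def output_fn(prediction: dict, accept: str) -> str:
    # Serialize the response for the client.
    return json.dumps(prediction)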
workbench/model_scripts/pytorch_model/pytorch_utils.py (identical changes apply to workbench/model_script_utils/pytorch_utils.py)

@@ -245,6 +245,7 @@ def train_model(
     patience: int = 20,
     batch_size: int = 128,
     learning_rate: float = 1e-3,
+    loss: str = "L1Loss",
     device: str = "cpu",
 ) -> tuple[TabularMLP, dict]:
     """Train the model with early stopping.
@@ -272,7 +273,16 @@
     if task == "classification":
         criterion = nn.CrossEntropyLoss()
     else:
-        criterion = nn.MSELoss()
+        # Map loss name to PyTorch loss class
+        loss_map = {
+            "L1Loss": nn.L1Loss,
+            "MSELoss": nn.MSELoss,
+            "HuberLoss": nn.HuberLoss,
+            "SmoothL1Loss": nn.SmoothL1Loss,
+        }
+        if loss not in loss_map:
+            raise ValueError(f"Unknown loss '{loss}'. Supported: {list(loss_map.keys())}")
+        criterion = loss_map[loss]()
 
     optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
 
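The practical difference between the supported criteria, and why the default moved from MSELoss to L1Loss, is easy to see on a batch with one large miss. The snippet below is a standalone illustration using the same four PyTorch loss classes (the tensor values are made up):

import torch
import torch.nn as nn

preds = torch.tensor([1.0, 2.0, 10.0])   # last prediction misses badly
targets = torch.tensor([1.0, 2.0, 3.0])

for name, cls in {
    "L1Loss": nn.L1Loss,            # MAE: linear in the error
    "MSELoss": nn.MSELoss,          # squared error: the outlier dominates
    "HuberLoss": nn.HuberLoss,      # quadratic near zero, linear in the tails
    "SmoothL1Loss": nn.SmoothL1Loss,
}.items():
    print(f"{name:>13}: {cls()(preds, targets).item():.3f}")

# MSELoss grows quadratically with the 7-unit miss while L1Loss grows
# linearly, which makes L1Loss (MAE) the more outlier-robust default
# for noisy regression targets.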
workbench/model_scripts/script_generation.py

@@ -100,6 +100,7 @@ def generate_model_script(template_params: dict) -> str:
         - model_metrics_s3_path (str): The S3 path to store the model metrics
         - train_all_data (bool): Whether to train on all (100%) of the data
         - hyperparameters (dict, optional): Hyperparameters for the model (default: None)
+        - child_endpoints (list[str], optional): For META models, list of child endpoint names
 
     Returns:
         str: The name of the generated model script
@@ -116,6 +117,9 @@ def generate_model_script(template_params: dict) -> str:
     elif template_params["model_framework"] == ModelFramework.CHEMPROP:
         template_name = "chemprop.template"
         model_script_dir = "chemprop"
+    elif template_params["model_framework"] == ModelFramework.META:
+        template_name = "meta_model.template"
+        model_script_dir = "meta_model"
     elif template_params["model_type"] in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.CLASSIFIER]:
         template_name = "xgb_model.template"
         model_script_dir = "xgb_model"
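The META branch slots into the same template dispatch as the other frameworks. A standalone sketch of that dispatch logic follows; the enum values and the fallback ordering are assumptions, and only the CHEMPROP and META branches are taken from this diff:

from enum import Enum


class ModelFramework(Enum):
    XGBOOST = "xgboost"    # illustrative value
    CHEMPROP = "chemprop"  # illustrative value
    META = "meta"          # new framework in 0.8.217


def pick_template(framework: ModelFramework) -> tuple[str, str]:
    # Returns (template_name, model_script_dir), mirroring generate_model_script.
    if framework == ModelFramework.CHEMPROP:
        return "chemprop.template", "chemprop"
    if framework == ModelFramework.META:
        # META models wrap child endpoints; meta_model.template is rendered
        # with the child_endpoints list from template_params.
        return "meta_model.template", "meta_model"
    return "xgb_model.template", "xgb_model"


print(pick_template(ModelFramework.META))  # ('meta_model.template', 'meta_model')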