workbench 0.8.213__py3-none-any.whl → 0.8.217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +257 -80
  3. workbench/algorithms/dataframe/projection_2d.py +38 -21
  4. workbench/algorithms/dataframe/proximity.py +75 -150
  5. workbench/algorithms/graph/light/proximity_graph.py +5 -5
  6. workbench/algorithms/models/cleanlab_model.py +382 -0
  7. workbench/algorithms/models/noise_model.py +2 -2
  8. workbench/api/__init__.py +3 -0
  9. workbench/api/endpoint.py +10 -5
  10. workbench/api/feature_set.py +76 -6
  11. workbench/api/meta_model.py +289 -0
  12. workbench/api/model.py +43 -4
  13. workbench/core/artifacts/endpoint_core.py +63 -115
  14. workbench/core/artifacts/feature_set_core.py +1 -1
  15. workbench/core/artifacts/model_core.py +6 -4
  16. workbench/core/pipelines/pipeline_executor.py +1 -1
  17. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +30 -10
  18. workbench/model_script_utils/pytorch_utils.py +11 -1
  19. workbench/model_scripts/chemprop/chemprop.template +145 -69
  20. workbench/model_scripts/chemprop/generated_model_script.py +147 -71
  21. workbench/model_scripts/custom_models/chem_info/fingerprints.py +7 -3
  22. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  23. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
  24. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  25. workbench/model_scripts/custom_models/uq_models/meta_uq.template +6 -6
  26. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  27. workbench/model_scripts/meta_model/meta_model.template +209 -0
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +42 -24
  29. workbench/model_scripts/pytorch_model/pytorch.template +42 -24
  30. workbench/model_scripts/pytorch_model/pytorch_utils.py +11 -1
  31. workbench/model_scripts/script_generation.py +4 -0
  32. workbench/model_scripts/xgb_model/generated_model_script.py +169 -158
  33. workbench/model_scripts/xgb_model/xgb_model.template +163 -152
  34. workbench/repl/workbench_shell.py +0 -5
  35. workbench/scripts/endpoint_test.py +2 -2
  36. workbench/utils/chem_utils/fingerprints.py +7 -3
  37. workbench/utils/chemprop_utils.py +23 -5
  38. workbench/utils/meta_model_simulator.py +471 -0
  39. workbench/utils/metrics_utils.py +94 -10
  40. workbench/utils/model_utils.py +91 -9
  41. workbench/utils/pytorch_utils.py +1 -1
  42. workbench/web_interface/components/plugins/scatter_plot.py +4 -8
  43. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/METADATA +2 -1
  44. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/RECORD +48 -43
  45. workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
  46. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
  47. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/WHEEL +0 -0
  48. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/entry_points.txt +0 -0
  49. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/licenses/LICENSE +0 -0
  50. {workbench-0.8.213.dist-info → workbench-0.8.217.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,194 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.neighbors import NearestNeighbors
+ from typing import List, Optional
+ import logging
+
+ # Workbench Imports
+ from workbench.algorithms.dataframe.proximity import Proximity
+ from workbench.algorithms.dataframe.projection_2d import Projection2D
+
+ # Set up logging
+ log = logging.getLogger("workbench")
+
+
+ class FeatureSpaceProximity(Proximity):
+     """Proximity computations for numeric feature spaces using Euclidean distance."""
+
+     def __init__(
+         self,
+         df: pd.DataFrame,
+         id_column: str,
+         features: List[str],
+         target: Optional[str] = None,
+         include_all_columns: bool = False,
+     ):
+         """
+         Initialize the FeatureSpaceProximity class.
+
+         Args:
+             df: DataFrame containing data for neighbor computations.
+             id_column: Name of the column used as the identifier.
+             features: List of feature column names to be used for neighbor computations.
+             target: Name of the target column. Defaults to None.
+             include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
+         """
+         # Validate and filter features before calling parent init
+         self._raw_features = features
+         super().__init__(
+             df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
+         )
+
+     def _prepare_data(self) -> None:
+         """Filter out non-numeric features and drop NaN rows."""
+         # Validate features
+         self.features = self._validate_features(self.df, self._raw_features)
+
+         # Drop NaN rows for the features we're using
+         self.df = self.df.dropna(subset=self.features).copy()
+
+     def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
+         """Remove non-numeric features and log warnings."""
+         non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
+         if non_numeric:
+             log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
+         return [f for f in features if f not in non_numeric]
+
+     def _build_model(self) -> None:
+         """Standardize features and fit Nearest Neighbors model."""
+         self.scaler = StandardScaler()
+         X = self.scaler.fit_transform(self.df[self.features])
+         self.nn = NearestNeighbors().fit(X)
+
+     def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
+         """Transform features using the fitted scaler."""
+         return self.scaler.transform(df[self.features])
+
+     def _project_2d(self) -> None:
+         """Project the numeric features to 2D for visualization."""
+         if len(self.features) >= 2:
+             self.df = Projection2D().fit_transform(self.df, features=self.features)
+
+
+ # Testing the FeatureSpaceProximity class
+ if __name__ == "__main__":
+
+     pd.set_option("display.max_columns", None)
+     pd.set_option("display.width", 1000)
+
+     # Create a sample DataFrame
+     data = {
+         "ID": [1, 2, 3, 4, 5],
+         "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+         "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+         "Feature3": [2.5, 2.4, 2.3, 2.3, np.nan],
+     }
+     df = pd.DataFrame(data)
+
+     # Test the FeatureSpaceProximity class
+     features = ["Feature1", "Feature2", "Feature3"]
+     prox = FeatureSpaceProximity(df, id_column="ID", features=features)
+     print(prox.neighbors(1, n_neighbors=2))
+
+     # Test the neighbors method with radius
+     print(prox.neighbors(1, radius=2.0))
+
+     # Test with Features list
+     prox = FeatureSpaceProximity(df, id_column="ID", features=["Feature1"])
+     print(prox.neighbors(1))
+
+     # Create a sample DataFrame
+     data = {
+         "id": ["a", "b", "c", "d", "e"],  # Testing string IDs
+         "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+         "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+         "target": [1, 0, 1, 0, 5],
+     }
+     df = pd.DataFrame(data)
+
+     # Test with String Ids
+     prox = FeatureSpaceProximity(
+         df,
+         id_column="id",
+         features=["Feature1", "Feature2"],
+         target="target",
+         include_all_columns=True,
+     )
+     print(prox.neighbors(["a", "b"]))
+
+     # Test duplicate IDs
+     data = {
+         "id": ["a", "b", "c", "d", "d"],  # Duplicate ID (d)
+         "Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
+         "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1],
+         "target": [1, 0, 1, 0, 5],
+     }
+     df = pd.DataFrame(data)
+     prox = FeatureSpaceProximity(df, id_column="id", features=["Feature1", "Feature2"], target="target")
+     print(df.equals(prox.df))
+
+     # Test on real data from Workbench
+     from workbench.api import FeatureSet, Model
+
+     fs = FeatureSet("aqsol_features")
+     model = Model("aqsol-regression")
+     features = model.features()
+     df = fs.pull_dataframe()
+     prox = FeatureSpaceProximity(df, id_column=fs.id_column, features=model.features(), target=model.target())
+     print("\n" + "=" * 80)
+     print("Testing Neighbors...")
+     print("=" * 80)
+     test_id = df[fs.id_column].tolist()[0]
+     print(f"\nNeighbors for ID {test_id}:")
+     print(prox.neighbors(test_id))
+
+     print("\n" + "=" * 80)
+     print("Testing isolated_compounds...")
+     print("=" * 80)
+
+     # Test isolated data in the top 1%
+     isolated_1pct = prox.isolated(top_percent=1.0)
+     print(f"\nTop 1% most isolated compounds (n={len(isolated_1pct)}):")
+     print(isolated_1pct)
+
+     # Test isolated data in the top 5%
+     isolated_5pct = prox.isolated(top_percent=5.0)
+     print(f"\nTop 5% most isolated compounds (n={len(isolated_5pct)}):")
+     print(isolated_5pct)
+
+     print("\n" + "=" * 80)
+     print("Testing target_gradients...")
+     print("=" * 80)
+
+     # Test with different parameters
+     gradients_1pct = prox.target_gradients(top_percent=1.0, min_delta=1.0)
+     print(f"\nTop 1% target gradients (min_delta=1.0) (n={len(gradients_1pct)}):")
+     print(gradients_1pct)
+
+     gradients_5pct = prox.target_gradients(top_percent=5.0, min_delta=5.0)
+     print(f"\nTop 5% target gradients (min_delta=5.0) (n={len(gradients_5pct)}):")
+     print(gradients_5pct)
+
+     # Test proximity_stats
+     print("\n" + "=" * 80)
+     print("Testing proximity_stats...")
+     print("=" * 80)
+     stats = prox.proximity_stats()
+     print(stats)
+
+     # Plot the distance distribution using pandas
+     print("\n" + "=" * 80)
+     print("Plotting distance distribution...")
+     print("=" * 80)
+     prox.df["nn_distance"].hist(bins=50, figsize=(10, 6), edgecolor="black")
+
+     # Visualize the 2D projection
+     print("\n" + "=" * 80)
+     print("Visualizing 2D Projection...")
+     print("=" * 80)
+     from workbench.web_interface.components.plugin_unit_test import PluginUnitTest
+     from workbench.web_interface.components.plugins.scatter_plot import ScatterPlot
+
+     unit_test = PluginUnitTest(ScatterPlot, input_data=prox.df[:1000], x="x", y="y", color=model.target())
+     unit_test.run()
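
The test block above exercises both fixed-count (n_neighbors=2) and radius (radius=2.0) queries. The Proximity base class is not part of this diff; the following is a minimal sketch, assuming its neighbor lookups resolve to scikit-learn's kneighbors and radius_neighbors calls on the standardized feature matrix built by _build_model and _transform_features above.

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Toy data mirroring the test block above
df = pd.DataFrame({"Feature1": [0.1, 0.2, 0.3, 0.4, 0.5],
                   "Feature2": [0.5, 0.4, 0.3, 0.2, 0.1]})

# Standardize and fit, as in _build_model()
scaler = StandardScaler()
X = scaler.fit_transform(df[["Feature1", "Feature2"]])
nn = NearestNeighbors().fit(X)

# Fixed-count query (akin to neighbors(1, n_neighbors=2))
query = X[[0]]  # row for the first ID
dist, idx = nn.kneighbors(query, n_neighbors=2)

# Radius query (akin to neighbors(1, radius=2.0));
# the radius is in standardized (z-score) units
dist_r, idx_r = nn.radius_neighbors(query, radius=2.0)
print(idx, idx_r)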
@@ -18,7 +18,7 @@ import pandas as pd
  from typing import List, Tuple
 
  # Local Imports
- from proximity import Proximity
+ from feature_space_proximity import FeatureSpaceProximity
 
 
  # Template Placeholders
@@ -28,7 +28,7 @@ TEMPLATE_PARAMS = {
      "features": "{{feature_list}}",
      "compressed_features": "{{compressed_features}}",
      "train_all_data": "{{train_all_data}}",
-     "track_columns": "{{track_columns}}",
+     "include_all_columns": "{{include_all_columns}}",
  }
 
 
@@ -166,7 +166,7 @@ if __name__ == "__main__":
      orig_features = features.copy()
      compressed_features = TEMPLATE_PARAMS["compressed_features"]
      train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
+     include_all_columns = TEMPLATE_PARAMS["include_all_columns"]  # Defaults to False
      validation_split = 0.2
 
      # Script arguments for input/output directories
@@ -260,8 +260,8 @@ if __name__ == "__main__":
      with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
          json.dump(orig_features, fp)  # We save the original features, not the decompressed ones
 
-     # Now the Proximity model
-     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
+     # Now the FeatureSpaceProximity model
+     model = FeatureSpaceProximity(df_train, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns)
 
      # Now serialize the model
      model.serialize(args.model_dir)
@@ -282,7 +282,7 @@ def model_fn(model_dir) -> dict:
      ngb_model = joblib.load(os.path.join(model_dir, "ngb_model.joblib"))
 
      # Deserialize the proximity model
-     prox_model = FeatureSpaceProximity.deserialize(model_dir)
+     prox_model = FeatureSpaceProximity.deserialize(model_dir)
 
      return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}
 
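
The hunks above rename the template parameter track_columns to include_all_columns and swap the serialized model class from Proximity to FeatureSpaceProximity. As a minimal before/after sketch of the call sites touched by this diff (variable names exactly as they appear in the template):

# Before (0.8.213): positional args plus track_columns
model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
prox_model = Proximity.deserialize(model_dir)

# After (0.8.217): keyword args plus include_all_columns
model = FeatureSpaceProximity(
    df_train, id_column=id_column, features=features, target=target,
    include_all_columns=include_all_columns,
)
prox_model = FeatureSpaceProximity.deserialize(model_dir)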
@@ -0,0 +1,209 @@
+ # Meta Model Template for Workbench
+ #
+ # NOTE: This is called a "meta model" but it's really a "meta endpoint" - it aggregates
+ # predictions from multiple child endpoints. We call it a "model" because Workbench
+ # creates Model artifacts that get deployed as Endpoints, so this follows that pattern.
+ #
+ # Assumptions/Shortcuts:
+ # - All child endpoints are regression models
+ # - All child endpoints output 'prediction' and 'confidence' columns
+ # - Aggregation uses model weights (provided at meta model creation time)
+ #
+ # This template:
+ # - Has no real training phase (just saves metadata including model weights)
+ # - At inference time, calls child endpoints and aggregates their predictions
+
+ import argparse
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from io import StringIO
+
+ import pandas as pd
+
+ from workbench_bridges.endpoints.fast_inference import fast_inference
+
+ # Template parameters (filled in by Workbench)
+ TEMPLATE_PARAMS = {
+     "child_endpoints": ['logd-reg-pytorch', 'logd-reg-chemprop'],
+     "target_column": "logd",
+     "model_weights": {'logd-reg-pytorch': 0.4228205813233993, 'logd-reg-chemprop': 0.5771794186766008},
+     "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/logd-meta/training",
+     "aws_region": "us-west-2",
+ }
+
+
+ def invoke_endpoints_parallel(endpoint_names: list[str], df: pd.DataFrame) -> dict[str, pd.DataFrame]:
+     """Call multiple child endpoints in parallel and collect their results.
+
+     Args:
+         endpoint_names: List of endpoint names to call
+         df: Input DataFrame to send to each endpoint
+
+     Returns:
+         Dict mapping endpoint_name -> result DataFrame (or None if failed)
+     """
+     results = {}
+
+     def call_endpoint(name: str) -> tuple[str, pd.DataFrame | None]:
+         try:
+             return name, fast_inference(name, df)
+         except Exception as e:
+             print(f"Error calling endpoint {name}: {e}")
+             return name, None
+
+     with ThreadPoolExecutor(max_workers=len(endpoint_names)) as executor:
+         futures = {executor.submit(call_endpoint, name): name for name in endpoint_names}
+         for future in as_completed(futures):
+             name, result = future.result()
+             results[name] = result
+
+     return results
+
+
+ def aggregate_predictions(results: dict[str, pd.DataFrame], model_weights: dict[str, float]) -> pd.DataFrame:
+     """Aggregate predictions from multiple endpoints using model weights.
+
+     Args:
+         results: Dict mapping endpoint_name -> predictions DataFrame
+             Each DataFrame must have 'prediction' and 'confidence' columns
+         model_weights: Dict mapping endpoint_name -> weight
+
+     Returns:
+         DataFrame with aggregated prediction, prediction_std, and confidence
+     """
+     # Filter out failed endpoints
+     valid_results = {k: v for k, v in results.items() if v is not None}
+     if not valid_results:
+         raise ValueError("All child endpoints failed")
+
+     # Use first result as base (for id columns, etc.)
+     first_df = list(valid_results.values())[0]
+     output_df = first_df.drop(columns=["prediction", "confidence", "prediction_std"], errors="ignore").copy()
+
+     # Build DataFrames of predictions and confidences from all endpoints
+     pred_df = pd.DataFrame({name: df["prediction"] for name, df in valid_results.items()})
+     conf_df = pd.DataFrame({name: df["confidence"] for name, df in valid_results.items()})
+
+     # Apply model weights (renormalize for valid endpoints only)
+     valid_weights = {k: model_weights.get(k, 1.0) for k in valid_results}
+     weight_sum = sum(valid_weights.values())
+     normalized_weights = {k: v / weight_sum for k, v in valid_weights.items()}
+
+     # Weighted average
+     output_df["prediction"] = sum(pred_df[name] * w for name, w in normalized_weights.items())
+
+     # Ensemble std across child endpoints
+     output_df["prediction_std"] = pred_df.std(axis=1)
+
+     # Aggregated confidence: weighted mean of child confidences
+     output_df["confidence"] = sum(conf_df[name] * w for name, w in normalized_weights.items())
+
+     return output_df
+
+
+ # =============================================================================
+ # Model Loading (for SageMaker inference)
+ # =============================================================================
+ def model_fn(model_dir: str) -> dict:
+     """Load meta model configuration."""
+     with open(os.path.join(model_dir, "meta_config.json")) as f:
+         config = json.load(f)
+
+     # Set AWS_REGION for fast_inference (baked in at training time)
+     if config.get("aws_region"):
+         os.environ["AWS_REGION"] = config["aws_region"]
+
+     print(f"Meta model loaded: {len(config['child_endpoints'])} child endpoints")
+     print(f"Model weights: {config.get('model_weights')}")
+     print(f"AWS region: {config.get('aws_region')}")
+     return config
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         return output_df.to_csv(index=False), "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ # =============================================================================
+ # Inference (for SageMaker inference)
+ # =============================================================================
+ def predict_fn(df: pd.DataFrame, config: dict) -> pd.DataFrame:
+     """Run inference by calling child endpoints and aggregating results."""
+     child_endpoints = config["child_endpoints"]
+     model_weights = config.get("model_weights", {})
+
+     print(f"Calling {len(child_endpoints)} child endpoints: {child_endpoints}")
+
+     # Call all child endpoints
+     results = invoke_endpoints_parallel(child_endpoints, df)
+
+     # Report status
+     for name, result in results.items():
+         status = f"{len(result)} rows" if result is not None else "FAILED"
+         print(f"  {name}: {status}")
+
+     # Aggregate predictions using model weights
+     output_df = aggregate_predictions(results, model_weights)
+
+     print(f"Aggregated {len(output_df)} predictions from {len(results)} endpoints")
+     return output_df
+
+
+ # =============================================================================
+ # Training (just saves configuration - no actual training)
+ # =============================================================================
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+     parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+     args = parser.parse_args()
+
+     child_endpoints = TEMPLATE_PARAMS["child_endpoints"]
+     target_column = TEMPLATE_PARAMS["target_column"]
+     model_weights = TEMPLATE_PARAMS["model_weights"]
+     aws_region = TEMPLATE_PARAMS["aws_region"]
+
+     print("=" * 60)
+     print("Meta Model Configuration")
+     print("=" * 60)
+     print(f"Child endpoints: {child_endpoints}")
+     print(f"Target column: {target_column}")
+     print(f"Model weights: {model_weights}")
+     print(f"AWS region: {aws_region}")
+
+     # Save configuration for inference
+     config = {
+         "child_endpoints": child_endpoints,
+         "target_column": target_column,
+         "model_weights": model_weights,
+         "aws_region": aws_region,
+     }
+
+     with open(os.path.join(args.model_dir, "meta_config.json"), "w") as f:
+         json.dump(config, f, indent=2)
+
+     print(f"\nMeta model configuration saved to {args.model_dir}")
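
To make the aggregation above concrete, here is a small standalone sketch of the same weighted-average logic on two hypothetical child results (endpoint names, rows, and weights are made up); it mirrors what aggregate_predictions computes for prediction, prediction_std, and confidence.

import pandas as pd

# Hypothetical results from two child endpoints (same row order and ids)
results = {
    "endpoint-a": pd.DataFrame({"id": [1, 2], "prediction": [2.0, 3.0], "confidence": [0.8, 0.6]}),
    "endpoint-b": pd.DataFrame({"id": [1, 2], "prediction": [2.4, 2.6], "confidence": [0.9, 0.7]}),
}
model_weights = {"endpoint-a": 0.4, "endpoint-b": 0.6}

pred_df = pd.DataFrame({name: df["prediction"] for name, df in results.items()})
conf_df = pd.DataFrame({name: df["confidence"] for name, df in results.items()})

# Renormalize weights (a no-op here since both endpoints succeeded and weights sum to 1)
w_sum = sum(model_weights.values())
weights = {k: v / w_sum for k, v in model_weights.items()}

out = results["endpoint-a"][["id"]].copy()
out["prediction"] = sum(pred_df[n] * w for n, w in weights.items())  # row 1: 0.4*2.0 + 0.6*2.4 = 2.24
out["prediction_std"] = pred_df.std(axis=1)                          # sample std across child endpoints
out["confidence"] = sum(conf_df[n] * w for n, w in weights.items())
print(out)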
@@ -0,0 +1,209 @@
+ # Meta Model Template for Workbench
+ #
+ # NOTE: This is called a "meta model" but it's really a "meta endpoint" - it aggregates
+ # predictions from multiple child endpoints. We call it a "model" because Workbench
+ # creates Model artifacts that get deployed as Endpoints, so this follows that pattern.
+ #
+ # Assumptions/Shortcuts:
+ # - All child endpoints are regression models
+ # - All child endpoints output 'prediction' and 'confidence' columns
+ # - Aggregation uses model weights (provided at meta model creation time)
+ #
+ # This template:
+ # - Has no real training phase (just saves metadata including model weights)
+ # - At inference time, calls child endpoints and aggregates their predictions
+
+ import argparse
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from io import StringIO
+
+ import pandas as pd
+
+ from workbench_bridges.endpoints.fast_inference import fast_inference
+
+ # Template parameters (filled in by Workbench)
+ TEMPLATE_PARAMS = {
+     "child_endpoints": "{{child_endpoints}}",
+     "target_column": "{{target_column}}",
+     "model_weights": "{{model_weights}}",
+     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
+     "aws_region": "{{aws_region}}",
+ }
+
+
+ def invoke_endpoints_parallel(endpoint_names: list[str], df: pd.DataFrame) -> dict[str, pd.DataFrame]:
+     """Call multiple child endpoints in parallel and collect their results.
+
+     Args:
+         endpoint_names: List of endpoint names to call
+         df: Input DataFrame to send to each endpoint
+
+     Returns:
+         Dict mapping endpoint_name -> result DataFrame (or None if failed)
+     """
+     results = {}
+
+     def call_endpoint(name: str) -> tuple[str, pd.DataFrame | None]:
+         try:
+             return name, fast_inference(name, df)
+         except Exception as e:
+             print(f"Error calling endpoint {name}: {e}")
+             return name, None
+
+     with ThreadPoolExecutor(max_workers=len(endpoint_names)) as executor:
+         futures = {executor.submit(call_endpoint, name): name for name in endpoint_names}
+         for future in as_completed(futures):
+             name, result = future.result()
+             results[name] = result
+
+     return results
+
+
+ def aggregate_predictions(results: dict[str, pd.DataFrame], model_weights: dict[str, float]) -> pd.DataFrame:
+     """Aggregate predictions from multiple endpoints using model weights.
+
+     Args:
+         results: Dict mapping endpoint_name -> predictions DataFrame
+             Each DataFrame must have 'prediction' and 'confidence' columns
+         model_weights: Dict mapping endpoint_name -> weight
+
+     Returns:
+         DataFrame with aggregated prediction, prediction_std, and confidence
+     """
+     # Filter out failed endpoints
+     valid_results = {k: v for k, v in results.items() if v is not None}
+     if not valid_results:
+         raise ValueError("All child endpoints failed")
+
+     # Use first result as base (for id columns, etc.)
+     first_df = list(valid_results.values())[0]
+     output_df = first_df.drop(columns=["prediction", "confidence", "prediction_std"], errors="ignore").copy()
+
+     # Build DataFrames of predictions and confidences from all endpoints
+     pred_df = pd.DataFrame({name: df["prediction"] for name, df in valid_results.items()})
+     conf_df = pd.DataFrame({name: df["confidence"] for name, df in valid_results.items()})
+
+     # Apply model weights (renormalize for valid endpoints only)
+     valid_weights = {k: model_weights.get(k, 1.0) for k in valid_results}
+     weight_sum = sum(valid_weights.values())
+     normalized_weights = {k: v / weight_sum for k, v in valid_weights.items()}
+
+     # Weighted average
+     output_df["prediction"] = sum(pred_df[name] * w for name, w in normalized_weights.items())
+
+     # Ensemble std across child endpoints
+     output_df["prediction_std"] = pred_df.std(axis=1)
+
+     # Aggregated confidence: weighted mean of child confidences
+     output_df["confidence"] = sum(conf_df[name] * w for name, w in normalized_weights.items())
+
+     return output_df
+
+
+ # =============================================================================
+ # Model Loading (for SageMaker inference)
+ # =============================================================================
+ def model_fn(model_dir: str) -> dict:
+     """Load meta model configuration."""
+     with open(os.path.join(model_dir, "meta_config.json")) as f:
+         config = json.load(f)
+
+     # Set AWS_REGION for fast_inference (baked in at training time)
+     if config.get("aws_region"):
+         os.environ["AWS_REGION"] = config["aws_region"]
+
+     print(f"Meta model loaded: {len(config['child_endpoints'])} child endpoints")
+     print(f"Model weights: {config.get('model_weights')}")
+     print(f"AWS region: {config.get('aws_region')}")
+     return config
+
+
+ def input_fn(input_data, content_type):
+     """Parse input data and return a DataFrame."""
+     if not input_data:
+         raise ValueError("Empty input data is not supported!")
+
+     # Decode bytes to string if necessary
+     if isinstance(input_data, bytes):
+         input_data = input_data.decode("utf-8")
+
+     if "text/csv" in content_type:
+         return pd.read_csv(StringIO(input_data))
+     elif "application/json" in content_type:
+         return pd.DataFrame(json.loads(input_data))
+     else:
+         raise ValueError(f"{content_type} not supported!")
+
+
+ def output_fn(output_df, accept_type):
+     """Supports both CSV and JSON output formats."""
+     if "text/csv" in accept_type:
+         return output_df.to_csv(index=False), "text/csv"
+     elif "application/json" in accept_type:
+         return output_df.to_json(orient="records"), "application/json"
+     else:
+         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+ # =============================================================================
+ # Inference (for SageMaker inference)
+ # =============================================================================
+ def predict_fn(df: pd.DataFrame, config: dict) -> pd.DataFrame:
+     """Run inference by calling child endpoints and aggregating results."""
+     child_endpoints = config["child_endpoints"]
+     model_weights = config.get("model_weights", {})
+
+     print(f"Calling {len(child_endpoints)} child endpoints: {child_endpoints}")
+
+     # Call all child endpoints
+     results = invoke_endpoints_parallel(child_endpoints, df)
+
+     # Report status
+     for name, result in results.items():
+         status = f"{len(result)} rows" if result is not None else "FAILED"
+         print(f"  {name}: {status}")
+
+     # Aggregate predictions using model weights
+     output_df = aggregate_predictions(results, model_weights)
+
+     print(f"Aggregated {len(output_df)} predictions from {len(results)} endpoints")
+     return output_df
+
+
+ # =============================================================================
+ # Training (just saves configuration - no actual training)
+ # =============================================================================
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+     parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
+     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+     args = parser.parse_args()
+
+     child_endpoints = TEMPLATE_PARAMS["child_endpoints"]
+     target_column = TEMPLATE_PARAMS["target_column"]
+     model_weights = TEMPLATE_PARAMS["model_weights"]
+     aws_region = TEMPLATE_PARAMS["aws_region"]
+
+     print("=" * 60)
+     print("Meta Model Configuration")
+     print("=" * 60)
+     print(f"Child endpoints: {child_endpoints}")
+     print(f"Target column: {target_column}")
+     print(f"Model weights: {model_weights}")
+     print(f"AWS region: {aws_region}")
+
+     # Save configuration for inference
+     config = {
+         "child_endpoints": child_endpoints,
+         "target_column": target_column,
+         "model_weights": model_weights,
+         "aws_region": aws_region,
+     }
+
+     with open(os.path.join(args.model_dir, "meta_config.json"), "w") as f:
+         json.dump(config, f, indent=2)
+
+     print(f"\nMeta model configuration saved to {args.model_dir}")
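
The only difference between this template and the generated_model_script.py shown earlier is the TEMPLATE_PARAMS block, where the quoted "{{placeholders}}" have been replaced with concrete Python values. The actual substitution is performed by workbench/model_scripts/script_generation.py (also touched in this release) and may differ; the snippet below is only a hypothetical illustration of that kind of replacement, with made-up parameter values and file names.

# Hypothetical illustration: replace quoted "{{key}}" tokens with Python literals
params = {
    "child_endpoints": ["logd-reg-pytorch", "logd-reg-chemprop"],
    "target_column": "logd",
}

with open("meta_model.template") as f:
    script = f.read()

for key, value in params.items():
    # '"{{child_endpoints}}"' -> "['logd-reg-pytorch', 'logd-reg-chemprop']", etc.
    script = script.replace('"{{' + key + '}}"', repr(value))

with open("generated_model_script.py", "w") as f:
    f.write(script)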