workbench 0.8.174__py3-none-any.whl → 0.8.227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic. Click here for more details.

Files changed (145) hide show
  1. workbench/__init__.py +1 -0
  2. workbench/algorithms/dataframe/__init__.py +1 -2
  3. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  4. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  5. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  6. workbench/algorithms/dataframe/projection_2d.py +44 -21
  7. workbench/algorithms/dataframe/proximity.py +259 -305
  8. workbench/algorithms/graph/light/proximity_graph.py +12 -11
  9. workbench/algorithms/models/cleanlab_model.py +382 -0
  10. workbench/algorithms/models/noise_model.py +388 -0
  11. workbench/algorithms/sql/column_stats.py +0 -1
  12. workbench/algorithms/sql/correlations.py +0 -1
  13. workbench/algorithms/sql/descriptive_stats.py +0 -1
  14. workbench/algorithms/sql/outliers.py +3 -3
  15. workbench/api/__init__.py +5 -1
  16. workbench/api/df_store.py +17 -108
  17. workbench/api/endpoint.py +14 -12
  18. workbench/api/feature_set.py +117 -11
  19. workbench/api/meta.py +0 -1
  20. workbench/api/meta_model.py +289 -0
  21. workbench/api/model.py +52 -21
  22. workbench/api/parameter_store.py +3 -52
  23. workbench/cached/cached_meta.py +0 -1
  24. workbench/cached/cached_model.py +49 -11
  25. workbench/core/artifacts/__init__.py +11 -2
  26. workbench/core/artifacts/artifact.py +7 -7
  27. workbench/core/artifacts/data_capture_core.py +8 -1
  28. workbench/core/artifacts/df_store_core.py +114 -0
  29. workbench/core/artifacts/endpoint_core.py +323 -205
  30. workbench/core/artifacts/feature_set_core.py +249 -45
  31. workbench/core/artifacts/model_core.py +133 -101
  32. workbench/core/artifacts/parameter_store_core.py +98 -0
  33. workbench/core/cloud_platform/aws/aws_account_clamp.py +48 -2
  34. workbench/core/cloud_platform/cloud_meta.py +0 -1
  35. workbench/core/pipelines/pipeline_executor.py +1 -1
  36. workbench/core/transforms/features_to_model/features_to_model.py +60 -44
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +43 -10
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +277 -0
  45. workbench/model_scripts/chemprop/chemprop.template +774 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +774 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +18 -7
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +80 -58
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +0 -1
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -2
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +15 -16
  61. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  62. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  63. workbench/model_scripts/meta_model/meta_model.template +209 -0
  64. workbench/model_scripts/pytorch_model/generated_model_script.py +443 -499
  65. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  66. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  67. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  68. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  69. workbench/model_scripts/pytorch_model/uq_harness.py +277 -0
  70. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  71. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  72. workbench/model_scripts/script_generation.py +15 -12
  73. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  74. workbench/model_scripts/xgb_model/generated_model_script.py +371 -403
  75. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  76. workbench/model_scripts/xgb_model/uq_harness.py +277 -0
  77. workbench/model_scripts/xgb_model/xgb_model.template +367 -399
  78. workbench/repl/workbench_shell.py +18 -14
  79. workbench/resources/open_source_api.key +1 -1
  80. workbench/scripts/endpoint_test.py +162 -0
  81. workbench/scripts/lambda_test.py +73 -0
  82. workbench/scripts/meta_model_sim.py +35 -0
  83. workbench/scripts/ml_pipeline_sqs.py +122 -6
  84. workbench/scripts/training_test.py +85 -0
  85. workbench/themes/dark/custom.css +59 -0
  86. workbench/themes/dark/plotly.json +5 -5
  87. workbench/themes/light/custom.css +153 -40
  88. workbench/themes/light/plotly.json +9 -9
  89. workbench/themes/midnight_blue/custom.css +59 -0
  90. workbench/utils/aws_utils.py +0 -1
  91. workbench/utils/chem_utils/fingerprints.py +87 -46
  92. workbench/utils/chem_utils/mol_descriptors.py +18 -7
  93. workbench/utils/chem_utils/mol_standardize.py +80 -58
  94. workbench/utils/chem_utils/projections.py +16 -6
  95. workbench/utils/chem_utils/vis.py +25 -27
  96. workbench/utils/chemprop_utils.py +141 -0
  97. workbench/utils/config_manager.py +2 -6
  98. workbench/utils/endpoint_utils.py +5 -7
  99. workbench/utils/license_manager.py +2 -6
  100. workbench/utils/markdown_utils.py +57 -0
  101. workbench/utils/meta_model_simulator.py +499 -0
  102. workbench/utils/metrics_utils.py +256 -0
  103. workbench/utils/model_utils.py +274 -87
  104. workbench/utils/pipeline_utils.py +0 -1
  105. workbench/utils/plot_utils.py +159 -34
  106. workbench/utils/pytorch_utils.py +87 -0
  107. workbench/utils/shap_utils.py +11 -57
  108. workbench/utils/theme_manager.py +95 -30
  109. workbench/utils/xgboost_local_crossfold.py +267 -0
  110. workbench/utils/xgboost_model_utils.py +127 -220
  111. workbench/web_interface/components/experiments/outlier_plot.py +0 -1
  112. workbench/web_interface/components/model_plot.py +16 -2
  113. workbench/web_interface/components/plugin_unit_test.py +5 -3
  114. workbench/web_interface/components/plugins/ag_table.py +2 -4
  115. workbench/web_interface/components/plugins/confusion_matrix.py +3 -6
  116. workbench/web_interface/components/plugins/model_details.py +48 -80
  117. workbench/web_interface/components/plugins/scatter_plot.py +192 -92
  118. workbench/web_interface/components/settings_menu.py +184 -0
  119. workbench/web_interface/page_views/main_page.py +0 -1
  120. {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/METADATA +31 -17
  121. {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/RECORD +125 -111
  122. {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/entry_points.txt +4 -0
  123. {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/licenses/LICENSE +1 -1
  124. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  125. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  126. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  127. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  128. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  129. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  130. workbench/model_scripts/custom_models/uq_models/mapie.template +0 -502
  131. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -386
  132. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  133. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  134. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  135. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  136. workbench/themes/quartz/base_css.url +0 -1
  137. workbench/themes/quartz/custom.css +0 -117
  138. workbench/themes/quartz/plotly.json +0 -642
  139. workbench/themes/quartz_dark/base_css.url +0 -1
  140. workbench/themes/quartz_dark/custom.css +0 -131
  141. workbench/themes/quartz_dark/plotly.json +0 -642
  142. workbench/utils/fast_inference.py +0 -167
  143. workbench/utils/resource_utils.py +0 -39
  144. {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/WHEEL +0 -0
  145. {workbench-0.8.174.dist-info → workbench-0.8.227.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,209 @@
1
+ # Meta Model Template for Workbench
2
+ #
3
+ # NOTE: This is called a "meta model" but it's really a "meta endpoint" - it aggregates
4
+ # predictions from multiple child endpoints. We call it a "model" because Workbench
5
+ # creates Model artifacts that get deployed as Endpoints, so this follows that pattern.
6
+ #
7
+ # Assumptions/Shortcuts:
8
+ # - All child endpoints are regression models
9
+ # - All child endpoints output 'prediction' and 'confidence' columns
10
+ # - Aggregation uses model weights (provided at meta model creation time)
11
+ #
12
+ # This template:
13
+ # - Has no real training phase (just saves metadata including model weights)
14
+ # - At inference time, calls child endpoints and aggregates their predictions
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ from concurrent.futures import ThreadPoolExecutor, as_completed
20
+ from io import StringIO
21
+
22
+ import pandas as pd
23
+
24
+ from workbench_bridges.endpoints.fast_inference import fast_inference
25
+
26
# Template parameters (filled in by Workbench)
# Every "{{...}}" value below is a placeholder string; the training entry point at
# the bottom of this script copies most of them into meta_config.json.
TEMPLATE_PARAMS = {
    # Child endpoints that predict_fn calls at inference time
    "child_endpoints": "{{child_endpoints}}",
    # Saved into meta_config.json; not otherwise read by the code in this script
    "target_column": "{{target_column}}",
    # Per-endpoint weights used when aggregating child predictions
    "model_weights": "{{model_weights}}",
    # NOTE(review): not referenced anywhere else in the visible script - confirm it is still needed
    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
    # Exported as the AWS_REGION environment variable by model_fn
    "aws_region": "{{aws_region}}",
}
34
+
35
+
36
+ def invoke_endpoints_parallel(endpoint_names: list[str], df: pd.DataFrame) -> dict[str, pd.DataFrame]:
37
+ """Call multiple child endpoints in parallel and collect their results.
38
+
39
+ Args:
40
+ endpoint_names: List of endpoint names to call
41
+ df: Input DataFrame to send to each endpoint
42
+
43
+ Returns:
44
+ Dict mapping endpoint_name -> result DataFrame (or None if failed)
45
+ """
46
+ results = {}
47
+
48
+ def call_endpoint(name: str) -> tuple[str, pd.DataFrame | None]:
49
+ try:
50
+ return name, fast_inference(name, df)
51
+ except Exception as e:
52
+ print(f"Error calling endpoint {name}: {e}")
53
+ return name, None
54
+
55
+ with ThreadPoolExecutor(max_workers=len(endpoint_names)) as executor:
56
+ futures = {executor.submit(call_endpoint, name): name for name in endpoint_names}
57
+ for future in as_completed(futures):
58
+ name, result = future.result()
59
+ results[name] = result
60
+
61
+ return results
62
+
63
+
64
def aggregate_predictions(results: dict[str, pd.DataFrame], model_weights: dict[str, float]) -> pd.DataFrame:
    """Aggregate predictions from multiple endpoints using model weights.

    Args:
        results: Dict mapping endpoint_name -> predictions DataFrame (None marks a
            failed endpoint). Each DataFrame must have 'prediction' and
            'confidence' columns
        model_weights: Dict mapping endpoint_name -> weight. Endpoints without an
            entry get weight 1.0; None or an empty dict means equal weights

    Returns:
        DataFrame with aggregated prediction, prediction_std, and confidence,
        plus the remaining columns of the first successful result

    Raises:
        ValueError: If every endpoint failed, or if the weights of the endpoints
            that responded sum to zero
    """
    # Filter out failed endpoints
    valid_results = {name: df for name, df in results.items() if df is not None}
    if not valid_results:
        raise ValueError("All child endpoints failed")

    # Use first result as base (for id columns, etc.)
    first_df = next(iter(valid_results.values()))
    output_df = first_df.drop(columns=["prediction", "confidence", "prediction_std"], errors="ignore").copy()

    # Build DataFrames of predictions and confidences from all endpoints
    pred_df = pd.DataFrame({name: df["prediction"] for name, df in valid_results.items()})
    conf_df = pd.DataFrame({name: df["confidence"] for name, df in valid_results.items()})

    # Apply model weights (renormalize for valid endpoints only).
    # A missing weights mapping (e.g. JSON null in the config) means equal weights.
    weights = model_weights or {}
    valid_weights = {name: weights.get(name, 1.0) for name in valid_results}
    weight_sum = sum(valid_weights.values())
    if weight_sum == 0:
        # Without this guard the normalization below fails with a bare ZeroDivisionError
        raise ValueError(f"Model weights of the responding endpoints sum to zero: {valid_weights}")
    normalized_weights = {name: weight / weight_sum for name, weight in valid_weights.items()}

    # Weighted average
    output_df["prediction"] = sum(pred_df[name] * w for name, w in normalized_weights.items())

    # Ensemble std across child endpoints (unweighted; pandas' default sample std,
    # so a single responding endpoint gives NaN here)
    output_df["prediction_std"] = pred_df.std(axis=1)

    # Aggregated confidence: weighted mean of child confidences
    output_df["confidence"] = sum(conf_df[name] * w for name, w in normalized_weights.items())

    return output_df
103
+
104
+
105
+ # =============================================================================
106
+ # Model Loading (for SageMaker inference)
107
+ # =============================================================================
108
def model_fn(model_dir: str) -> dict:
    """Read meta_config.json from model_dir and return it as the model object.

    Args:
        model_dir: Directory that contains the saved meta_config.json

    Returns:
        The parsed configuration dict
    """
    config_path = os.path.join(model_dir, "meta_config.json")
    with open(config_path) as config_file:
        config = json.load(config_file)

    # Export the saved region (if any) so fast_inference can pick it up via AWS_REGION
    region = config.get("aws_region")
    if region:
        os.environ["AWS_REGION"] = region

    child_count = len(config["child_endpoints"])
    print(f"Meta model loaded: {child_count} child endpoints")
    print(f"Model weights: {config.get('model_weights')}")
    print(f"AWS region: {region}")
    return config
121
+
122
+
123
def input_fn(input_data, content_type):
    """Turn a raw request payload into a DataFrame.

    Args:
        input_data: Request body as str or bytes (bytes are decoded as UTF-8)
        content_type: Content type of the request; must contain "text/csv" or
            "application/json"

    Returns:
        DataFrame parsed from the payload

    Raises:
        ValueError: If the payload is empty or the content type is unsupported
    """
    if not input_data:
        raise ValueError("Empty input data is not supported!")

    # Normalize to text so both parsers below receive a str
    text = input_data.decode("utf-8") if isinstance(input_data, bytes) else input_data

    if "text/csv" in content_type:
        return pd.read_csv(StringIO(text))
    if "application/json" in content_type:
        return pd.DataFrame(json.loads(text))
    raise ValueError(f"{content_type} not supported!")
138
+
139
+
140
def output_fn(output_df, accept_type):
    """Serialize the prediction DataFrame as CSV or JSON.

    Args:
        output_df: DataFrame to serialize
        accept_type: Requested response type; CSV is chosen when it contains
            "text/csv", otherwise JSON when it contains "application/json"

    Returns:
        Tuple of (serialized payload, content type)

    Raises:
        RuntimeError: If accept_type names neither supported format
    """
    wants_csv = "text/csv" in accept_type
    if not wants_csv and "application/json" not in accept_type:
        raise RuntimeError(f"{accept_type} accept type is not supported by this script.")

    if wants_csv:
        payload = output_df.to_csv(index=False)
        return payload, "text/csv"

    payload = output_df.to_json(orient="records")
    return payload, "application/json"
148
+
149
+
150
+ # =============================================================================
151
+ # Inference (for SageMaker inference)
152
+ # =============================================================================
153
def predict_fn(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Run inference by calling child endpoints and aggregating results.

    Args:
        df: Input rows, sent unchanged to every child endpoint
        config: Configuration dict returned by model_fn; must contain
            "child_endpoints" and may contain "model_weights"

    Returns:
        DataFrame produced by aggregate_predictions

    Raises:
        ValueError: Propagated from aggregate_predictions when every child
            endpoint failed
    """
    child_endpoints = config["child_endpoints"]
    # `or {}` also covers a stored null, which .get()'s default would pass through as None
    model_weights = config.get("model_weights") or {}

    print(f"Calling {len(child_endpoints)} child endpoints: {child_endpoints}")

    # Call all child endpoints
    results = invoke_endpoints_parallel(child_endpoints, df)

    # Report status
    for name, result in results.items():
        status = f"{len(result)} rows" if result is not None else "FAILED"
        print(f"  {name}: {status}")

    # Aggregate predictions using model weights
    output_df = aggregate_predictions(results, model_weights)

    # Count only endpoints that actually responded; failed ones are excluded
    # from the aggregate and must not be reported as contributors.
    succeeded = sum(1 for result in results.values() if result is not None)
    print(f"Aggregated {len(output_df)} predictions from {succeeded} endpoints")
    return output_df
173
+
174
+
175
+ # =============================================================================
176
+ # Training (just saves configuration - no actual training)
177
+ # =============================================================================
178
if __name__ == "__main__":
    # No model is fitted here: the "training" run only records the template
    # parameters so that model_fn can load them again at inference time.
    parser = argparse.ArgumentParser()
    # Each option defaults to an SM_* environment variable, then to a fixed /opt/ml path
    for flag, env_var, fallback in (
        ("--model-dir", "SM_MODEL_DIR", "/opt/ml/model"),
        ("--output-data-dir", "SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"),
        ("--train", "SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"),
    ):
        parser.add_argument(flag, type=str, default=os.environ.get(env_var, fallback))
    args = parser.parse_args()

    child_endpoints, target_column, model_weights, aws_region = (
        TEMPLATE_PARAMS[key] for key in ("child_endpoints", "target_column", "model_weights", "aws_region")
    )

    rule = "=" * 60
    print(rule)
    print("Meta Model Configuration")
    print(rule)
    print(f"Child endpoints: {child_endpoints}")
    print(f"Target column: {target_column}")
    print(f"Model weights: {model_weights}")
    print(f"AWS region: {aws_region}")

    # Save configuration for inference
    config = {
        "child_endpoints": child_endpoints,
        "target_column": target_column,
        "model_weights": model_weights,
        "aws_region": aws_region,
    }

    config_path = os.path.join(args.model_dir, "meta_config.json")
    with open(config_path, "w") as config_file:
        json.dump(config, config_file, indent=2)

    print(f"\nMeta model configuration saved to {args.model_dir}")