workbench-0.8.171-py3-none-any.whl → workbench-0.8.173-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. workbench/algorithms/graph/light/proximity_graph.py +2 -1
  2. workbench/api/compound.py +1 -1
  3. workbench/api/feature_set.py +4 -4
  4. workbench/api/monitor.py +1 -16
  5. workbench/core/artifacts/artifact.py +11 -3
  6. workbench/core/artifacts/data_capture_core.py +315 -0
  7. workbench/core/artifacts/endpoint_core.py +9 -3
  8. workbench/core/artifacts/model_core.py +37 -14
  9. workbench/core/artifacts/monitor_core.py +33 -249
  10. workbench/core/cloud_platform/aws/aws_account_clamp.py +4 -1
  11. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  12. workbench/core/transforms/features_to_model/features_to_model.py +4 -4
  13. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +471 -0
  14. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +428 -0
  15. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  16. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +19 -9
  17. workbench/model_scripts/custom_models/uq_models/mapie.template +502 -0
  18. workbench/model_scripts/custom_models/uq_models/meta_uq.template +8 -5
  19. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  20. workbench/model_scripts/script_generation.py +5 -0
  21. workbench/model_scripts/xgb_model/generated_model_script.py +5 -5
  22. workbench/repl/workbench_shell.py +3 -3
  23. workbench/utils/chem_utils/__init__.py +0 -0
  24. workbench/utils/chem_utils/fingerprints.py +134 -0
  25. workbench/utils/chem_utils/misc.py +194 -0
  26. workbench/utils/chem_utils/mol_descriptors.py +471 -0
  27. workbench/utils/chem_utils/mol_standardize.py +428 -0
  28. workbench/utils/chem_utils/mol_tagging.py +348 -0
  29. workbench/utils/chem_utils/projections.py +209 -0
  30. workbench/utils/chem_utils/salts.py +256 -0
  31. workbench/utils/chem_utils/sdf.py +292 -0
  32. workbench/utils/chem_utils/toxicity.py +250 -0
  33. workbench/utils/chem_utils/vis.py +253 -0
  34. workbench/utils/model_utils.py +1 -1
  35. workbench/utils/monitor_utils.py +49 -56
  36. workbench/utils/pandas_utils.py +3 -3
  37. workbench/utils/workbench_sqs.py +1 -1
  38. workbench/utils/xgboost_model_utils.py +1 -0
  39. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  40. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/METADATA +1 -1
  41. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/RECORD +45 -34
  42. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  43. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  44. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  45. workbench/utils/chem_utils.py +0 -1556
  46. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/WHEEL +0 -0
  47. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/entry_points.txt +0 -0
  48. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/licenses/LICENSE +0 -0
  49. {workbench-0.8.171.dist-info → workbench-0.8.173.dist-info}/top_level.txt +0 -0
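
The headline change in this release is the breakup of the monolithic workbench/utils/chem_utils.py (item 45, 1,556 lines removed) into the new workbench/utils/chem_utils/ package (items 23–33), with the custom model scripts (items 13–14) picking up copies of the new modules. For downstream code, a split like this typically means moving from one wide import to the relevant submodule. A hypothetical before/after sketch — the function name compute_morgan_fingerprints is illustrative only, since the symbols inside each submodule are not visible in this diff:

# Hypothetical import migration for the chem_utils.py -> chem_utils/ refactor.
# compute_morgan_fingerprints is a made-up name for illustration; the actual
# functions live in the new submodules and are not shown in this listing.

# Before (0.8.171): one monolithic module
# from workbench.utils.chem_utils import compute_morgan_fingerprints

# After (0.8.173): topical submodules such as fingerprints, salts, toxicity
# from workbench.utils.chem_utils.fingerprints import compute_morgan_fingerprints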
workbench/model_scripts/custom_models/chem_info/tautomerize.py (deleted)
@@ -1,83 +0,0 @@
- # Model: tautomerization_processor
- #
- # Description: The tautomerization_processor model uses RDKit to perform tautomer enumeration
- # and canonicalization of chemical compounds. Tautomerization is the chemical process where
- # compounds can interconvert between structurally distinct forms, often affecting their
- # chemical properties and reactivity. This model provides a robust approach to identifying
- # and processing tautomers, crucial for improving molecular modeling and cheminformatics tasks
- # like virtual screening, QSAR modeling, and property prediction.
- #
- import argparse
- import os
- import joblib
- from io import StringIO
- import pandas as pd
- import json
-
- # Local imports
- from local_utils import tautomerize_smiles
-
-
- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the job and save the model artifacts.
- #
- if __name__ == "__main__":
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # This model doesn't get trained; it's a feature processing 'model'
-
-     # Sagemaker expects a model artifact, so we'll save a placeholder
-     placeholder_model = {}
-     joblib.dump(placeholder_model, os.path.join(args.model_dir, "model.joblib"))
-
-
- # Model loading and prediction functions
- def model_fn(model_dir):
-     return joblib.load(os.path.join(model_dir, "model.joblib"))
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     use_explicit_na = False
-     if "text/csv" in accept_type:
-         if use_explicit_na:
-             csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
-         else:
-             csv_output = output_df.to_csv(index=False)
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- # Prediction function
- def predict_fn(df, model):
-     # Perform Tautomerization
-     df = tautomerize_smiles(df)
-     return df
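
The removed tautomerize.py above delegated its chemistry to tautomerize_smiles() in the likewise-removed local_utils.py (item 42); that functionality now lives in the new chem_utils package (see mol_standardize.py, items 14 and 27). For reference only, here is a minimal sketch of SMILES tautomer canonicalization with RDKit's rdMolStandardize; the "smiles" column name and the DataFrame-in/DataFrame-out signature mirror the deleted script's predict_fn() contract but are otherwise assumptions:

# Minimal sketch of tautomer canonicalization with RDKit; this approximates
# what the deleted tautomerize_smiles() helper did -- it is not Workbench's API.
import pandas as pd
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

_enumerator = rdMolStandardize.TautomerEnumerator()  # build once, reuse across rows


def canonical_tautomer(smiles: str) -> str:
    """Return the canonical tautomer SMILES, or the input if it won't parse."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return smiles  # leave unparseable SMILES untouched
    return Chem.MolToSmiles(_enumerator.Canonicalize(mol))


def tautomerize_smiles(df: pd.DataFrame) -> pd.DataFrame:
    """DataFrame in, DataFrame out, matching the predict_fn() shape above."""
    df["smiles"] = df["smiles"].apply(canonical_tautomer)
    return df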
workbench/model_scripts/custom_models/uq_models/mapie_xgb.template (deleted)
@@ -1,203 +0,0 @@
- # Model: HistGradientBoosting with MAPIE Conformalized Quantile Regression
- from mapie.regression import MapieQuantileRegressor
- from sklearn.ensemble import HistGradientBoostingRegressor
- from sklearn.model_selection import train_test_split
- import numpy as np
-
- # Template Placeholders
- TEMPLATE_PARAMS = {
-     "features": "{{feature_list}}",
-     "target": "{{target_column}}",
-     "train_all_data": "{{train_all_data}}"
- }
-
- from io import StringIO
- import json
- import argparse
- import joblib
- import os
- import pandas as pd
-
-
- # Function to check if dataframe is empty
- def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
-     """Check if the DataFrame is empty and raise an error if so."""
-     if df.empty:
-         msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
-         print(msg)
-         raise ValueError(msg)
-
-
- def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
-     """
-     Matches and renames DataFrame columns to match model feature names (case-insensitive).
-     Prioritizes exact matches, then case-insensitive matches.
-
-     Raises ValueError if any model features cannot be matched.
-     """
-     df_columns_lower = {col.lower(): col for col in df.columns}
-     rename_dict = {}
-     missing = []
-     for feature in model_features:
-         if feature in df.columns:
-             continue # Exact match
-         elif feature.lower() in df_columns_lower:
-             rename_dict[df_columns_lower[feature.lower()]] = feature
-         else:
-             missing.append(feature)
-
-     if missing:
-         raise ValueError(f"Features not found: {missing}")
-
-     # Rename the DataFrame columns to match the model features
-     return df.rename(columns=rename_dict)
-
-
- # TRAINING SECTION
- #
- # This section (__main__) is where SageMaker will execute the training job
- # and save the model artifacts to the model directory.
- #
- if __name__ == "__main__":
-     # Template Parameters
-     features = TEMPLATE_PARAMS["features"]
-     target = TEMPLATE_PARAMS["target"]
-     train_all_data = TEMPLATE_PARAMS["train_all_data"]
-     validation_split = 0.2
-
-     # Script arguments for input/output directories
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
-     parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
-     parser.add_argument(
-         "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
-     )
-     args = parser.parse_args()
-
-     # Load training data from the specified directory
-     training_files = [
-         os.path.join(args.train, file)
-         for file in os.listdir(args.train) if file.endswith(".csv")
-     ]
-     df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
-
-     # Check if the DataFrame is empty
-     check_dataframe(df, "training_df")
-
-     # Training data split logic
-     if train_all_data:
-         # Use all data for both training and validation
-         print("Training on all data...")
-         df_train = df.copy()
-         df_val = df.copy()
-     elif "training" in df.columns:
-         # Split data based on a 'training' column if it exists
-         print("Splitting data based on 'training' column...")
-         df_train = df[df["training"]].copy()
-         df_val = df[~df["training"]].copy()
-     else:
-         # Perform a random split if no 'training' column is found
-         print("Splitting data randomly...")
-         df_train, df_val = train_test_split(df, test_size=validation_split, random_state=42)
-
-     # Create HistGradientBoosting base model configured for quantile regression
-     base_estimator = HistGradientBoostingRegressor(
-         loss='quantile', # Required for MAPIE CQR
-         quantile=0.5, # Will be overridden by MAPIE for different quantiles
-         max_iter=1000,
-         max_depth=6,
-         learning_rate=0.01,
-         random_state=42
-     )
-
-     # Create MAPIE CQR predictor - it will create quantile versions internally
-     model = MapieQuantileRegressor(
-         estimator=base_estimator,
-         method="quantile",
-         cv="split",
-         alpha=0.05 # For 95% coverage
-     )
-
-     # Prepare features and targets for training
-     X_train = df_train[features]
-     X_val = df_val[features]
-     y_train = df_train[target]
-     y_val = df_val[target]
-
-     # Fit the MAPIE CQR model (train/calibration is handled internally)
-     model.fit(X_train, y_train)
-
-     # Save the trained model and any necessary assets
-     joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
-
-     # Save the feature list to validate input during predictions
-     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-         json.dump(features, fp)
-
-
- #
- # Inference Section
- #
- def model_fn(model_dir):
-     """Load and return the model from the specified directory."""
-     return joblib.load(os.path.join(model_dir, "model.joblib"))
-
-
- def input_fn(input_data, content_type):
-     """Parse input data and return a DataFrame."""
-     if not input_data:
-         raise ValueError("Empty input data is not supported!")
-
-     # Decode bytes to string if necessary
-     if isinstance(input_data, bytes):
-         input_data = input_data.decode("utf-8")
-
-     if "text/csv" in content_type:
-         return pd.read_csv(StringIO(input_data))
-     elif "application/json" in content_type:
-         return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records
-     else:
-         raise ValueError(f"{content_type} not supported!")
-
-
- def output_fn(output_df, accept_type):
-     """Supports both CSV and JSON output formats."""
-     if "text/csv" in accept_type:
-         csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values
-         return csv_output, "text/csv"
-     elif "application/json" in accept_type:
-         return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null)
-     else:
-         raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
-
-
- def predict_fn(df, model):
-     """Make predictions using MAPIE CQR and return the DataFrame with results."""
-     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-
-     # Load feature columns from the saved file
-     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
-         model_features = json.load(fp)
-
-     # Match features in a case-insensitive manner
-     matched_df = match_features_case_insensitive(df, model_features)
-
-     # Get CQR predictions - returns point prediction and intervals
-     X_pred = matched_df[model_features]
-     y_pred, y_pis = model.predict(X_pred)
-
-     # Add predictions to dataframe with 95% intervals
-     df["prediction"] = y_pred
-     df["q_025"] = y_pis[:, 0, 0] # Lower bound (2.5th percentile)
-     df["q_975"] = y_pis[:, 1, 0] # Upper bound (97.5th percentile)
-
-     # Calculate std estimate from 95% interval
-     interval_width_95 = df["q_975"] - df["q_025"]
-     df["prediction_std"] = interval_width_95 / 3.92 # 95% CI = ±1.96σ, so width = 3.92σ
-
-     # Calculate 50% intervals using normal approximation
-     df["q_25"] = df["prediction"] - 0.674 * df["prediction_std"]
-     df["q_75"] = df["prediction"] + 0.674 * df["prediction_std"]
-
-     # Return the modified DataFrame
-     return df
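
One detail of the removed predict_fn() worth spelling out (the replacement mapie.template, item 17, presumably carries it forward): prediction_std and the 50% band are derived from the 95% conformal interval via a normal approximation. For N(μ, σ²) the central 95% interval is μ ± 1.96σ, so its full width is 3.92σ; the central 50% interval is μ ± 0.674σ. A quick numeric check of those constants:

# Verify the normal-quantile constants 3.92 and 0.674 used in predict_fn().
from scipy.stats import norm

z_975 = norm.ppf(0.975)  # ~1.9600: the 95% interval is mu +/- 1.96*sigma
z_750 = norm.ppf(0.750)  # ~0.6745: the 50% interval is mu +/- 0.674*sigma

sigma = 2.0
width_95 = 2 * z_975 * sigma  # full width of the central 95% interval
print(width_95 / 3.92)        # ~2.0 -> dividing the width by 3.92 recovers sigma
print(round(z_975, 3), round(z_750, 3))  # 1.96 0.674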