workbench 0.8.161__py3-none-any.whl → 0.8.192__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- workbench/algorithms/dataframe/proximity.py +143 -102
- workbench/algorithms/graph/light/proximity_graph.py +2 -1
- workbench/api/compound.py +1 -1
- workbench/api/endpoint.py +12 -0
- workbench/api/feature_set.py +4 -4
- workbench/api/meta.py +5 -2
- workbench/api/model.py +16 -12
- workbench/api/monitor.py +1 -16
- workbench/core/artifacts/artifact.py +11 -3
- workbench/core/artifacts/data_capture_core.py +355 -0
- workbench/core/artifacts/endpoint_core.py +168 -78
- workbench/core/artifacts/feature_set_core.py +72 -13
- workbench/core/artifacts/model_core.py +50 -15
- workbench/core/artifacts/monitor_core.py +33 -248
- workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
- workbench/core/cloud_platform/aws/aws_meta.py +12 -5
- workbench/core/cloud_platform/aws/aws_session.py +4 -4
- workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
- workbench/core/transforms/features_to_model/features_to_model.py +9 -4
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +36 -6
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +27 -0
- workbench/core/views/training_view.py +49 -53
- workbench/core/views/view.py +51 -1
- workbench/core/views/view_utils.py +4 -4
- workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
- workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
- workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
- workbench/model_scripts/custom_models/proximity/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
- workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +156 -58
- workbench/model_scripts/custom_models/uq_models/ngboost.template +20 -14
- workbench/model_scripts/custom_models/uq_models/proximity.py +143 -102
- workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
- workbench/model_scripts/pytorch_model/pytorch.template +19 -20
- workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
- workbench/model_scripts/script_generation.py +7 -2
- workbench/model_scripts/uq_models/mapie.template +492 -0
- workbench/model_scripts/uq_models/requirements.txt +1 -0
- workbench/model_scripts/xgb_model/xgb_model.template +31 -40
- workbench/repl/workbench_shell.py +11 -6
- workbench/scripts/lambda_launcher.py +63 -0
- workbench/scripts/ml_pipeline_batch.py +137 -0
- workbench/scripts/ml_pipeline_sqs.py +186 -0
- workbench/scripts/monitor_cloud_watch.py +20 -100
- workbench/utils/aws_utils.py +4 -3
- workbench/utils/chem_utils/__init__.py +0 -0
- workbench/utils/chem_utils/fingerprints.py +134 -0
- workbench/utils/chem_utils/misc.py +194 -0
- workbench/utils/chem_utils/mol_descriptors.py +483 -0
- workbench/utils/chem_utils/mol_standardize.py +450 -0
- workbench/utils/chem_utils/mol_tagging.py +348 -0
- workbench/utils/chem_utils/projections.py +209 -0
- workbench/utils/chem_utils/salts.py +256 -0
- workbench/utils/chem_utils/sdf.py +292 -0
- workbench/utils/chem_utils/toxicity.py +250 -0
- workbench/utils/chem_utils/vis.py +253 -0
- workbench/utils/cloudwatch_handler.py +1 -1
- workbench/utils/cloudwatch_utils.py +137 -0
- workbench/utils/config_manager.py +3 -7
- workbench/utils/endpoint_utils.py +5 -7
- workbench/utils/license_manager.py +2 -6
- workbench/utils/model_utils.py +76 -30
- workbench/utils/monitor_utils.py +44 -62
- workbench/utils/pandas_utils.py +3 -3
- workbench/utils/shap_utils.py +10 -2
- workbench/utils/workbench_logging.py +0 -3
- workbench/utils/workbench_sqs.py +1 -1
- workbench/utils/xgboost_model_utils.py +283 -145
- workbench/web_interface/components/plugins/dashboard_status.py +3 -1
- workbench/web_interface/components/plugins/generated_compounds.py +1 -1
- workbench/web_interface/components/plugins/scatter_plot.py +3 -3
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/METADATA +4 -4
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/RECORD +81 -76
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/entry_points.txt +3 -0
- workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
- workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
- workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
- workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
- workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
- workbench/model_scripts/pytorch_model/generated_model_script.py +0 -565
- workbench/model_scripts/quant_regression/quant_regression.template +0 -279
- workbench/model_scripts/quant_regression/requirements.txt +0 -1
- workbench/model_scripts/scikit_learn/generated_model_script.py +0 -307
- workbench/model_scripts/xgb_model/generated_model_script.py +0 -477
- workbench/utils/chem_utils.py +0 -1556
- workbench/utils/execution_environment.py +0 -211
- workbench/utils/fast_inference.py +0 -167
- workbench/utils/resource_utils.py +0 -39
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/WHEEL +0 -0
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.161.dist-info → workbench-0.8.192.dist-info}/top_level.txt +0 -0
The hunks below are from workbench/model_scripts/custom_models/uq_models/meta_uq.template (+156 -58), the hybrid XGBoost/NGBoost/Proximity UQ template.

@@ -1,34 +1,33 @@
 # Model: NGBoost Regressor with Distribution output
 from ngboost import NGBRegressor
-from
+from ngboost.distns import Cauchy
+from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
 import argparse
 import joblib
 import os
+import numpy as np
 import pandas as pd
+from typing import List, Tuple

 # Local Imports
 from proximity import Proximity


-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
-    "features": "{{feature_list}}",
     "target": "{{target_column}}",
+    "features": "{{feature_list}}",
+    "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }

@@ -72,16 +71,99 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     return df.rename(columns=rename_dict)


-
-
-
-
-
+def convert_categorical_types(df: pd.DataFrame, features: list, category_mappings={}) -> tuple:
+    """
+    Converts appropriate columns to categorical type with consistent mappings.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to process.
+        features (list): List of feature names to consider for conversion.
+        category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
+            training mode. If populated, we're in inference mode.
+
+    Returns:
+        tuple: (processed DataFrame, category mappings dictionary)
+    """
+    # Training mode
+    if category_mappings == {}:
+        for col in df.select_dtypes(include=["object", "string"]):
+            if col in features and df[col].nunique() < 20:
+                print(f"Training mode: Converting {col} to category")
+                df[col] = df[col].astype("category")
+                category_mappings[col] = df[col].cat.categories.tolist()  # Store category mappings
+
+    # Inference mode
+    else:
+        for col, categories in category_mappings.items():
+            if col in df.columns:
+                print(f"Inference mode: Applying categorical mapping for {col}")
+                df[col] = pd.Categorical(df[col], categories=categories)  # Apply consistent categorical mapping
+
+    return df, category_mappings
+
+
+def decompress_features(
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+) -> Tuple[pd.DataFrame, List[str]]:
+    """Prepare features for the model by decompressing bitstring features
+
+    Args:
+        df (pd.DataFrame): The features DataFrame
+        features (List[str]): Full list of feature names
+        compressed_features (List[str]): List of feature names to decompress (bitstrings)
+
+    Returns:
+        pd.DataFrame: DataFrame with the decompressed features
+        List[str]: Updated list of feature names after decompression
+
+    Raises:
+        ValueError: If any missing values are found in the specified features
+    """
+
+    # Check for any missing values in the required features
+    missing_counts = df[features].isna().sum()
+    if missing_counts.any():
+        missing_features = missing_counts[missing_counts > 0]
+        print(
+            f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
+            "WARNING: You might want to remove/replace all NaN values before processing."
+        )
+
+    # Decompress the specified compressed features
+    decompressed_features = features.copy()
+    for feature in compressed_features:
+        if (feature not in df.columns) or (feature not in features):
+            print(f"Feature '{feature}' not in the features list, skipping decompression.")
+            continue
+
+        # Remove the feature from the list of features to avoid duplication
+        decompressed_features.remove(feature)
+
+        # Handle all compressed features as bitstrings
+        bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
+        prefix = feature[:3]
+
+        # Create all new columns at once - avoids fragmentation
+        new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
+        new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
+
+        # Add to features list
+        decompressed_features.extend(new_col_names)
+
+        # Drop original column and concatenate new ones
+        df = df.drop(columns=[feature])
+        df = pd.concat([df, new_df], axis=1)
+
+    return df, decompressed_features
+
+
 if __name__ == "__main__":
     # Template Parameters
     id_column = TEMPLATE_PARAMS["id_column"]
-    features = TEMPLATE_PARAMS["features"]
     target = TEMPLATE_PARAMS["target"]
+    features = TEMPLATE_PARAMS["features"]
+    orig_features = features.copy()
+    compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     track_columns = TEMPLATE_PARAMS["track_columns"]  # Can be None
     validation_split = 0.2
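For context on the new helpers above: decompress_features expands a compressed bitstring column (for example a fingerprint stored as "0101...") into one uint8 column per bit and updates the feature list accordingly. A minimal, self-contained sketch of the same transformation; the fp column, its values, and the resulting column names are made up for illustration:

import numpy as np
import pandas as pd

# Hypothetical input: a 4-bit fingerprint stored as a bitstring column "fp"
df = pd.DataFrame({"id": [1, 2], "fp": ["0101", "1100"]})

# Core of the decompression: one uint8 column per bit, named <prefix>_<i>
bit_matrix = np.array([list(b) for b in df["fp"]], dtype=np.uint8)
new_cols = [f"fp_{i}" for i in range(bit_matrix.shape[1])]
df = pd.concat([df.drop(columns=["fp"]), pd.DataFrame(bit_matrix, columns=new_cols, index=df.index)], axis=1)
print(df)
#    id  fp_0  fp_1  fp_2  fp_3
# 0   1     0     1     0     1
# 1   2     1     1     0     0

Building all of the new columns in a single concat, as the template does, avoids the DataFrame fragmentation that repeated single-column inserts would cause.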
@@ -95,53 +177,62 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

-    #
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    # Read the training data into DataFrames
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
-
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(features)}")

-    #
-
+    # Convert any features that might be categorical to 'category' type
+    all_df, category_mappings = convert_categorical_types(all_df, features)

-    #
+    # If we have compressed features, decompress them
+    if compressed_features:
+        print(f"Decompressing features {compressed_features}...")
+        all_df, features = decompress_features(all_df, features, compressed_features)
+
+    # Do we want to train on all the data?
     if train_all_data:
-
-
-
-
-
-
-        print("
-        df_train =
-        df_val =
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column?
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]]
+        df_val = all_df[~all_df["training"]]
     else:
-        #
-        print("
-        df_train, df_val = train_test_split(
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")

     # We're using XGBoost for point predictions and NGBoost for uncertainty quantification
     xgb_model = XGBRegressor()
-    ngb_model = NGBRegressor()
+    ngb_model = NGBRegressor()  # Dist=Cauchy) Seems to give HUGE prediction intervals

     # Prepare features and targets for training
     X_train = df_train[features]
-
+    X_validate = df_val[features]
     y_train = df_train[target]
-
+    y_validate = df_val[target]

     # Train both models using the training data
     xgb_model.fit(X_train, y_train)
-    ngb_model.fit(X_train, y_train, X_val=
+    ngb_model.fit(X_train, y_train, X_val=X_validate, Y_val=y_validate)

     # Make Predictions on the Validation Set
     print(f"Making Predictions on Validation Set...")
-    y_validate = df_val[target]
-    X_validate = df_val[features]
     preds = xgb_model.predict(X_validate)

     # Calculate various model performance metrics (regression)
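The category_mappings dict produced by convert_categorical_types during training is meant to be reapplied at inference so that the integer category codes line up across the two phases. A small illustration with a hypothetical color feature (not part of the diff):

import pandas as pd

# Training side: learn the category list from the training data
train = pd.Series(["red", "blue", "red"]).astype("category")
categories = train.cat.categories.tolist()  # ['blue', 'red']

# Inference side: apply the SAME categories so the codes match training
infer = pd.Categorical(["red", "green"], categories=categories)
print(infer.codes)  # [ 1 -1] -> 'green' was never seen in training and maps to NaN (code -1)

Values unseen at training time become missing rather than silently shifting the codes of the known categories.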
@@ -159,9 +250,9 @@ if __name__ == "__main__":
     # Save the trained NGBoost model
     joblib.dump(ngb_model, os.path.join(args.model_dir, "ngb_model.joblib"))

-    # Save the
+    # Save the features (this will validate input during predictions)
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
-        json.dump(
+        json.dump(orig_features, fp)  # We save the original features, not the decompressed ones

     # Now the Proximity model
     model = Proximity(df_train, id_column, features, target, track_columns=track_columns)
@@ -187,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)

-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}


 def input_fn(input_data, content_type):
@@ -251,20 +338,31 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params[
-    df["prediction_std"] = dist_params[
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
-
-
+    # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
+    # so we need to adjust the bounds to include the point prediction
+    df["q_025"] = np.minimum(y_dists.ppf(0.025), df["prediction"])
+    df["q_975"] = np.maximum(y_dists.ppf(0.975), df["prediction"])
+
+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile

     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)
-    df["q_75"] = y_dists.ppf(0.75)
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile

-    #
-
-
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Compute Nearest neighbors with Proximity model
     models["proximity"].neighbors(df)
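The predict_fn hunk above widens the outer 95% band so it always contains the XGBoost point prediction (df["prediction"]), since the point estimate and the NGBoost predictive distribution come from two different models. A small sketch of that adjustment, using scipy's normal distribution as a stand-in for the NGBoost distribution object; all numbers are made up:

import numpy as np
from scipy.stats import norm

# Hypothetical XGBoost point predictions and NGBoost Normal parameters (loc, scale)
point_pred = np.array([2.5, 5.0])
loc, scale = np.array([1.5, 5.2]), np.array([0.4, 0.3])

# NGBoost-style interval bounds from the predictive distribution (ppf = percent point function)
lo, hi = norm.ppf(0.025, loc, scale), norm.ppf(0.975, loc, scale)

# Hybrid adjustment from the diff: stretch the band so it covers the XGB point estimate
q_025 = np.minimum(lo, point_pred)
q_975 = np.maximum(hi, point_pred)
print(q_975)  # first upper bound widens from ~2.28 to 2.5; the second stays at ~5.79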
The remaining hunks are from workbench/model_scripts/custom_models/uq_models/ngboost.template (+20 -14), the plain NGBoost template.

@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split

 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }

@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")

     # Combine files and read them all into a single pandas dataframe
@@ -212,16 +205,29 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params

     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params[
-    df["prediction_std"] = dist_params[
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation

     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
     df["q_975"] = y_dists.ppf(0.975)  # 97.5th percentile

+    # Add 90% prediction intervals
+    df["q_05"] = y_dists.ppf(0.05)  # 5th percentile
+    df["q_95"] = y_dists.ppf(0.95)  # 95th percentile
+
+    # Add 80% prediction intervals
+    df["q_10"] = y_dists.ppf(0.10)  # 10th percentile
+    df["q_90"] = y_dists.ppf(0.90)  # 90th percentile
+
     # Add 50% prediction intervals
-    df["q_25"] = y_dists.ppf(0.25)
-    df["q_75"] = y_dists.ppf(0.75)
+    df["q_25"] = y_dists.ppf(0.25)  # 25th percentile
+    df["q_75"] = y_dists.ppf(0.75)  # 75th percentile
+
+    # Reorder the quantile columns for easier reading
+    quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+    other_cols = [col for col in df.columns if col not in quantile_cols]
+    df = df[other_cols + quantile_cols]

     # Return the modified DataFrame
     return df
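Both templates pull the predictive mean and standard deviation out of y_dists.params and derive the quantile columns via ppf. A minimal end-to-end sketch with NGBoost on synthetic data (not taken from the package) shows where those values come from:

import numpy as np
import pandas as pd
from ngboost import NGBRegressor

# Tiny synthetic regression problem, purely for illustration
X = pd.DataFrame({"x": np.linspace(0, 10, 50)})
y = 2.0 * X["x"] + np.random.default_rng(0).normal(0, 1, 50)

ngb = NGBRegressor(n_estimators=50, verbose=False).fit(X, y)
y_dists = ngb.pred_dist(X.head(3))

print(y_dists.params["loc"])    # per-row predictive mean  -> "prediction" / "prediction_uq"
print(y_dists.params["scale"])  # per-row predictive std   -> "prediction_std"
print(y_dists.ppf(0.05), y_dists.ppf(0.95))  # bounds of the 90% prediction interval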