workbench 0.8.202__py3-none-any.whl → 0.8.220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of workbench might be problematic. Click here for more details.
- workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
- workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
- workbench/algorithms/dataframe/fingerprint_proximity.py +421 -85
- workbench/algorithms/dataframe/projection_2d.py +44 -21
- workbench/algorithms/dataframe/proximity.py +78 -150
- workbench/algorithms/graph/light/proximity_graph.py +5 -5
- workbench/algorithms/models/cleanlab_model.py +382 -0
- workbench/algorithms/models/noise_model.py +388 -0
- workbench/algorithms/sql/outliers.py +3 -3
- workbench/api/__init__.py +3 -0
- workbench/api/df_store.py +17 -108
- workbench/api/endpoint.py +13 -11
- workbench/api/feature_set.py +111 -8
- workbench/api/meta_model.py +289 -0
- workbench/api/model.py +45 -12
- workbench/api/parameter_store.py +3 -52
- workbench/cached/cached_model.py +4 -4
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +228 -237
- workbench/core/artifacts/feature_set_core.py +185 -230
- workbench/core/artifacts/model_core.py +34 -26
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/pipelines/pipeline_executor.py +1 -1
- workbench/core/transforms/features_to_model/features_to_model.py +22 -10
- workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +41 -10
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/model_script_utils/model_script_utils.py +339 -0
- workbench/model_script_utils/pytorch_utils.py +405 -0
- workbench/model_script_utils/uq_harness.py +278 -0
- workbench/model_scripts/chemprop/chemprop.template +428 -631
- workbench/model_scripts/chemprop/generated_model_script.py +432 -635
- workbench/model_scripts/chemprop/model_script_utils.py +339 -0
- workbench/model_scripts/chemprop/requirements.txt +2 -10
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +87 -46
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
- workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +6 -6
- workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
- workbench/model_scripts/meta_model/generated_model_script.py +209 -0
- workbench/model_scripts/meta_model/meta_model.template +209 -0
- workbench/model_scripts/pytorch_model/generated_model_script.py +374 -613
- workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
- workbench/model_scripts/pytorch_model/pytorch.template +370 -609
- workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
- workbench/model_scripts/pytorch_model/requirements.txt +1 -1
- workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
- workbench/model_scripts/script_generation.py +6 -5
- workbench/model_scripts/uq_models/generated_model_script.py +65 -422
- workbench/model_scripts/xgb_model/generated_model_script.py +372 -395
- workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
- workbench/model_scripts/xgb_model/uq_harness.py +278 -0
- workbench/model_scripts/xgb_model/xgb_model.template +366 -396
- workbench/repl/workbench_shell.py +0 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +2 -2
- workbench/scripts/meta_model_sim.py +35 -0
- workbench/scripts/training_test.py +85 -0
- workbench/utils/chem_utils/fingerprints.py +87 -46
- workbench/utils/chem_utils/projections.py +16 -6
- workbench/utils/chemprop_utils.py +36 -655
- workbench/utils/meta_model_simulator.py +499 -0
- workbench/utils/metrics_utils.py +256 -0
- workbench/utils/model_utils.py +192 -54
- workbench/utils/pytorch_utils.py +33 -472
- workbench/utils/shap_utils.py +1 -55
- workbench/utils/xgboost_local_crossfold.py +267 -0
- workbench/utils/xgboost_model_utils.py +49 -356
- workbench/web_interface/components/model_plot.py +7 -1
- workbench/web_interface/components/plugins/model_details.py +30 -68
- workbench/web_interface/components/plugins/scatter_plot.py +4 -8
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/METADATA +6 -5
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/RECORD +76 -60
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/entry_points.txt +2 -0
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -296
- workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
- workbench/model_scripts/custom_models/proximity/proximity.py +0 -410
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -377
- workbench/model_scripts/custom_models/uq_models/proximity.py +0 -410
- workbench/model_scripts/uq_models/mapie.template +0 -605
- workbench/model_scripts/uq_models/requirements.txt +0 -1
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.202.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
|
@@ -1,475 +1,445 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
# XGBoost Model Template for Workbench
|
|
2
|
+
#
|
|
3
|
+
# This template handles both classification and regression models with:
|
|
4
|
+
# - K-fold cross-validation ensemble training (or single train/val split)
|
|
5
|
+
# - Out-of-fold predictions for validation metrics
|
|
6
|
+
# - Uncertainty quantification for regression models
|
|
7
|
+
# - Sample weights support
|
|
8
|
+
# - Categorical feature handling
|
|
9
|
+
# - Compressed feature decompression
|
|
10
|
+
#
|
|
11
|
+
# NOTE: Imports are structured to minimize serverless endpoint startup time.
|
|
12
|
+
# Heavy imports (sklearn, awswrangler) are deferred to training time.
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
import joblib
|
|
4
18
|
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
import xgboost as xgb
|
|
5
21
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
22
|
+
from model_script_utils import (
|
|
23
|
+
convert_categorical_types,
|
|
24
|
+
decompress_features,
|
|
25
|
+
expand_proba_column,
|
|
26
|
+
input_fn,
|
|
27
|
+
match_features_case_insensitive,
|
|
28
|
+
output_fn,
|
|
29
|
+
)
|
|
30
|
+
from uq_harness import (
|
|
31
|
+
compute_confidence,
|
|
32
|
+
load_uq_models,
|
|
33
|
+
predict_intervals,
|
|
14
34
|
)
|
|
15
|
-
from scipy.stats import spearmanr
|
|
16
35
|
|
|
17
|
-
#
|
|
18
|
-
|
|
36
|
+
# =============================================================================
|
|
37
|
+
# Default Hyperparameters
|
|
38
|
+
# =============================================================================
|
|
39
|
+
DEFAULT_HYPERPARAMETERS = {
|
|
40
|
+
# Training parameters
|
|
41
|
+
"n_folds": 5, # Number of CV folds (1 = single train/val split)
|
|
42
|
+
# Core tree parameters
|
|
43
|
+
"n_estimators": 300,
|
|
44
|
+
"max_depth": 7,
|
|
45
|
+
"learning_rate": 0.05,
|
|
46
|
+
# Sampling parameters (less aggressive - ensemble provides regularization)
|
|
47
|
+
"subsample": 0.8,
|
|
48
|
+
"colsample_bytree": 0.8,
|
|
49
|
+
# Regularization (lighter - ensemble averaging reduces overfitting)
|
|
50
|
+
"min_child_weight": 3,
|
|
51
|
+
"gamma": 0.1,
|
|
52
|
+
"reg_alpha": 0.1,
|
|
53
|
+
"reg_lambda": 1.0,
|
|
54
|
+
# Random seed
|
|
55
|
+
"seed": 42,
|
|
56
|
+
}
|
|
19
57
|
|
|
20
|
-
#
|
|
21
|
-
|
|
58
|
+
# Workbench-specific parameters (not passed to XGBoost)
|
|
59
|
+
WORKBENCH_PARAMS = {"n_folds"}
|
|
22
60
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
import argparse
|
|
26
|
-
import joblib
|
|
27
|
-
import os
|
|
28
|
-
import pandas as pd
|
|
29
|
-
from typing import List, Tuple
|
|
61
|
+
# Regression-only parameters (filtered out for classifiers)
|
|
62
|
+
REGRESSION_ONLY_PARAMS = {"objective"}
|
|
30
63
|
|
|
31
|
-
# Template
|
|
64
|
+
# Template parameters (filled in by Workbench)
|
|
32
65
|
TEMPLATE_PARAMS = {
|
|
33
66
|
"model_type": "{{model_type}}",
|
|
34
67
|
"target": "{{target_column}}",
|
|
35
68
|
"features": "{{feature_list}}",
|
|
69
|
+
"id_column": "{{id_column}}",
|
|
36
70
|
"compressed_features": "{{compressed_features}}",
|
|
37
71
|
"model_metrics_s3_path": "{{model_metrics_s3_path}}",
|
|
38
|
-
"train_all_data": "{{train_all_data}}",
|
|
39
72
|
"hyperparameters": "{{hyperparameters}}",
|
|
40
73
|
}
|
|
41
74
|
|
|
42
75
|
|
|
43
|
-
#
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
|
|
59
|
-
"""
|
|
60
|
-
Expands a column in a DataFrame containing a list of probabilities into separate columns.
|
|
61
|
-
|
|
62
|
-
Args:
|
|
63
|
-
df (pd.DataFrame): DataFrame containing a "pred_proba" column
|
|
64
|
-
class_labels (List[str]): List of class labels
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns
|
|
68
|
-
"""
|
|
69
|
-
|
|
70
|
-
# Sanity check
|
|
71
|
-
proba_column = "pred_proba"
|
|
72
|
-
if proba_column not in df.columns:
|
|
73
|
-
raise ValueError('DataFrame does not contain a "pred_proba" column')
|
|
74
|
-
|
|
75
|
-
# Construct new column names with '_proba' suffix
|
|
76
|
-
proba_splits = [f"{label}_proba" for label in class_labels]
|
|
77
|
-
|
|
78
|
-
# Expand the proba_column into separate columns for each probability
|
|
79
|
-
proba_df = pd.DataFrame(df[proba_column].tolist(), columns=proba_splits)
|
|
80
|
-
|
|
81
|
-
# Drop any proba columns and reset the index in prep for the concat
|
|
82
|
-
df = df.drop(columns=[proba_column] + proba_splits, errors="ignore")
|
|
83
|
-
df = df.reset_index(drop=True)
|
|
76
|
+
# =============================================================================
|
|
77
|
+
# Model Loading (for SageMaker inference)
|
|
78
|
+
# =============================================================================
|
|
79
|
+
def model_fn(model_dir: str) -> dict:
|
|
80
|
+
"""Load XGBoost ensemble from the specified directory."""
|
|
81
|
+
# Load ensemble metadata
|
|
82
|
+
metadata_path = os.path.join(model_dir, "ensemble_metadata.json")
|
|
83
|
+
if os.path.exists(metadata_path):
|
|
84
|
+
with open(metadata_path) as f:
|
|
85
|
+
metadata = json.load(f)
|
|
86
|
+
n_ensemble = metadata["n_ensemble"]
|
|
87
|
+
else:
|
|
88
|
+
n_ensemble = 1 # Legacy single model
|
|
84
89
|
|
|
85
|
-
#
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
90
|
+
# Load ensemble models
|
|
91
|
+
ensemble_models = []
|
|
92
|
+
for i in range(n_ensemble):
|
|
93
|
+
model_path = os.path.join(model_dir, f"xgb_model_{i}.joblib")
|
|
94
|
+
if not os.path.exists(model_path):
|
|
95
|
+
model_path = os.path.join(model_dir, "xgb_model.joblib") # Legacy fallback
|
|
96
|
+
ensemble_models.append(joblib.load(model_path))
|
|
89
97
|
|
|
98
|
+
print(f"Loaded {len(ensemble_models)} model(s)")
|
|
90
99
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
100
|
+
# Load label encoder (classifier only)
|
|
101
|
+
label_encoder = None
|
|
102
|
+
encoder_path = os.path.join(model_dir, "label_encoder.joblib")
|
|
103
|
+
if os.path.exists(encoder_path):
|
|
104
|
+
label_encoder = joblib.load(encoder_path)
|
|
105
|
+
|
|
106
|
+
# Load category mappings
|
|
107
|
+
category_mappings = {}
|
|
108
|
+
category_path = os.path.join(model_dir, "category_mappings.json")
|
|
109
|
+
if os.path.exists(category_path):
|
|
110
|
+
with open(category_path) as f:
|
|
111
|
+
category_mappings = json.load(f)
|
|
112
|
+
|
|
113
|
+
# Load UQ models (regression only)
|
|
114
|
+
uq_models, uq_metadata = None, None
|
|
115
|
+
uq_path = os.path.join(model_dir, "uq_metadata.json")
|
|
116
|
+
if os.path.exists(uq_path):
|
|
117
|
+
uq_models, uq_metadata = load_uq_models(model_dir)
|
|
118
|
+
|
|
119
|
+
return {
|
|
120
|
+
"ensemble_models": ensemble_models,
|
|
121
|
+
"n_ensemble": n_ensemble,
|
|
122
|
+
"label_encoder": label_encoder,
|
|
123
|
+
"category_mappings": category_mappings,
|
|
124
|
+
"uq_models": uq_models,
|
|
125
|
+
"uq_metadata": uq_metadata,
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# =============================================================================
|
|
130
|
+
# Inference (for SageMaker inference)
|
|
131
|
+
# =============================================================================
|
|
132
|
+
def predict_fn(df: pd.DataFrame, model_dict: dict) -> pd.DataFrame:
|
|
133
|
+
"""Make predictions with XGBoost ensemble."""
|
|
134
|
+
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
135
|
+
with open(os.path.join(model_dir, "feature_columns.json")) as f:
|
|
136
|
+
features = json.load(f)
|
|
137
|
+
print(f"Model Features: {features}")
|
|
108
138
|
|
|
109
|
-
|
|
110
|
-
|
|
139
|
+
# Extract model components
|
|
140
|
+
ensemble_models = model_dict["ensemble_models"]
|
|
141
|
+
label_encoder = model_dict.get("label_encoder")
|
|
142
|
+
category_mappings = model_dict.get("category_mappings", {})
|
|
143
|
+
uq_models = model_dict.get("uq_models")
|
|
144
|
+
uq_metadata = model_dict.get("uq_metadata")
|
|
145
|
+
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
111
146
|
|
|
112
|
-
#
|
|
113
|
-
|
|
147
|
+
# Prepare features
|
|
148
|
+
matched_df = match_features_case_insensitive(df, features)
|
|
149
|
+
matched_df, _ = convert_categorical_types(matched_df, features, category_mappings)
|
|
114
150
|
|
|
151
|
+
if compressed_features:
|
|
152
|
+
print("Decompressing features for prediction...")
|
|
153
|
+
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
115
154
|
|
|
116
|
-
|
|
117
|
-
"""
|
|
118
|
-
Converts appropriate columns to categorical type with consistent mappings.
|
|
155
|
+
X = matched_df[features]
|
|
119
156
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
category_mappings (dict, optional): Existing category mappings. If empty dict, we're in
|
|
124
|
-
training mode. If populated, we're in inference mode.
|
|
157
|
+
# Collect ensemble predictions
|
|
158
|
+
all_preds = [m.predict(X) for m in ensemble_models]
|
|
159
|
+
ensemble_preds = np.stack(all_preds, axis=0)
|
|
125
160
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
for col in df.select_dtypes(include=["object", "string"]):
|
|
132
|
-
if col in features and df[col].nunique() < 20:
|
|
133
|
-
print(f"Training mode: Converting {col} to category")
|
|
134
|
-
df[col] = df[col].astype("category")
|
|
135
|
-
category_mappings[col] = df[col].cat.categories.tolist() # Store category mappings
|
|
161
|
+
if label_encoder is not None:
|
|
162
|
+
# Classification: average probabilities, then argmax
|
|
163
|
+
all_probs = [m.predict_proba(X) for m in ensemble_models]
|
|
164
|
+
avg_probs = np.mean(np.stack(all_probs, axis=0), axis=0)
|
|
165
|
+
class_preds = np.argmax(avg_probs, axis=1)
|
|
136
166
|
|
|
137
|
-
|
|
167
|
+
df["prediction"] = label_encoder.inverse_transform(class_preds)
|
|
168
|
+
df["pred_proba"] = [p.tolist() for p in avg_probs]
|
|
169
|
+
df = expand_proba_column(df, label_encoder.classes_)
|
|
138
170
|
else:
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
df[col] = pd.Categorical(df[col], categories=categories) # Apply consistent categorical mapping
|
|
143
|
-
|
|
144
|
-
return df, category_mappings
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def decompress_features(
|
|
148
|
-
df: pd.DataFrame, features: List[str], compressed_features: List[str]
|
|
149
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
|
150
|
-
"""Prepare features for the model by decompressing bitstring features
|
|
151
|
-
|
|
152
|
-
Args:
|
|
153
|
-
df (pd.DataFrame): The features DataFrame
|
|
154
|
-
features (List[str]): Full list of feature names
|
|
155
|
-
compressed_features (List[str]): List of feature names to decompress (bitstrings)
|
|
156
|
-
|
|
157
|
-
Returns:
|
|
158
|
-
pd.DataFrame: DataFrame with the decompressed features
|
|
159
|
-
List[str]: Updated list of feature names after decompression
|
|
160
|
-
|
|
161
|
-
Raises:
|
|
162
|
-
ValueError: If any missing values are found in the specified features
|
|
163
|
-
"""
|
|
164
|
-
|
|
165
|
-
# Check for any missing values in the required features
|
|
166
|
-
missing_counts = df[features].isna().sum()
|
|
167
|
-
if missing_counts.any():
|
|
168
|
-
missing_features = missing_counts[missing_counts > 0]
|
|
169
|
-
print(
|
|
170
|
-
f"WARNING: Found missing values in features: {missing_features.to_dict()}. "
|
|
171
|
-
"WARNING: You might want to remove/replace all NaN values before processing."
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
# Decompress the specified compressed features
|
|
175
|
-
decompressed_features = features.copy()
|
|
176
|
-
for feature in compressed_features:
|
|
177
|
-
if (feature not in df.columns) or (feature not in features):
|
|
178
|
-
print(f"Feature '{feature}' not in the features list, skipping decompression.")
|
|
179
|
-
continue
|
|
180
|
-
|
|
181
|
-
# Remove the feature from the list of features to avoid duplication
|
|
182
|
-
decompressed_features.remove(feature)
|
|
183
|
-
|
|
184
|
-
# Handle all compressed features as bitstrings
|
|
185
|
-
bit_matrix = np.array([list(bitstring) for bitstring in df[feature]], dtype=np.uint8)
|
|
186
|
-
prefix = feature[:3]
|
|
187
|
-
|
|
188
|
-
# Create all new columns at once - avoids fragmentation
|
|
189
|
-
new_col_names = [f"{prefix}_{i}" for i in range(bit_matrix.shape[1])]
|
|
190
|
-
new_df = pd.DataFrame(bit_matrix, columns=new_col_names, index=df.index)
|
|
191
|
-
|
|
192
|
-
# Add to features list
|
|
193
|
-
decompressed_features.extend(new_col_names)
|
|
171
|
+
# Regression: average predictions
|
|
172
|
+
df["prediction"] = np.mean(ensemble_preds, axis=0)
|
|
173
|
+
df["prediction_std"] = np.std(ensemble_preds, axis=0)
|
|
194
174
|
|
|
195
|
-
#
|
|
196
|
-
|
|
197
|
-
|
|
175
|
+
# Add UQ intervals if available
|
|
176
|
+
if uq_models and uq_metadata:
|
|
177
|
+
df = predict_intervals(df, X, uq_models, uq_metadata)
|
|
178
|
+
df = compute_confidence(df, uq_metadata["median_interval_width"], "q_10", "q_90")
|
|
198
179
|
|
|
199
|
-
|
|
180
|
+
print(f"Inference complete: {len(df)} predictions, {len(ensemble_models)} ensemble members")
|
|
181
|
+
return df
|
|
200
182
|
|
|
201
183
|
|
|
184
|
+
# =============================================================================
|
|
185
|
+
# Training
|
|
186
|
+
# =============================================================================
|
|
202
187
|
if __name__ == "__main__":
|
|
203
|
-
|
|
188
|
+
# -------------------------------------------------------------------------
|
|
189
|
+
# Training-only imports (deferred to reduce serverless startup time)
|
|
190
|
+
# -------------------------------------------------------------------------
|
|
191
|
+
import argparse
|
|
192
|
+
|
|
193
|
+
import awswrangler as wr
|
|
194
|
+
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
|
|
195
|
+
from sklearn.preprocessing import LabelEncoder
|
|
196
|
+
|
|
197
|
+
from model_script_utils import (
|
|
198
|
+
check_dataframe,
|
|
199
|
+
compute_classification_metrics,
|
|
200
|
+
compute_regression_metrics,
|
|
201
|
+
print_classification_metrics,
|
|
202
|
+
print_confusion_matrix,
|
|
203
|
+
print_regression_metrics,
|
|
204
|
+
)
|
|
205
|
+
from uq_harness import (
|
|
206
|
+
save_uq_models,
|
|
207
|
+
train_uq_models,
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
# -------------------------------------------------------------------------
|
|
211
|
+
# Setup: Parse arguments and load data
|
|
212
|
+
# -------------------------------------------------------------------------
|
|
213
|
+
parser = argparse.ArgumentParser()
|
|
214
|
+
parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
|
|
215
|
+
parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
|
|
216
|
+
parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"))
|
|
217
|
+
args = parser.parse_args()
|
|
204
218
|
|
|
205
|
-
#
|
|
219
|
+
# Extract template parameters
|
|
206
220
|
target = TEMPLATE_PARAMS["target"]
|
|
207
221
|
features = TEMPLATE_PARAMS["features"]
|
|
208
222
|
orig_features = features.copy()
|
|
223
|
+
id_column = TEMPLATE_PARAMS["id_column"]
|
|
209
224
|
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
210
225
|
model_type = TEMPLATE_PARAMS["model_type"]
|
|
211
226
|
model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
|
|
212
|
-
|
|
213
|
-
hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
|
|
214
|
-
validation_split = 0.2
|
|
215
|
-
|
|
216
|
-
# Script arguments for input/output directories
|
|
217
|
-
parser = argparse.ArgumentParser()
|
|
218
|
-
parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
|
|
219
|
-
parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
|
|
220
|
-
parser.add_argument(
|
|
221
|
-
"--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
|
|
222
|
-
)
|
|
223
|
-
args = parser.parse_args()
|
|
227
|
+
hyperparameters = {**DEFAULT_HYPERPARAMETERS, **(TEMPLATE_PARAMS["hyperparameters"] or {})}
|
|
224
228
|
|
|
225
|
-
#
|
|
226
|
-
training_files = [os.path.join(args.train,
|
|
229
|
+
# Load training data
|
|
230
|
+
training_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith(".csv")]
|
|
227
231
|
print(f"Training Files: {training_files}")
|
|
228
|
-
|
|
229
|
-
# Combine files and read them all into a single pandas dataframe
|
|
230
|
-
all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
|
|
231
|
-
|
|
232
|
-
# Check if the dataframe is empty
|
|
232
|
+
all_df = pd.concat([pd.read_csv(f, engine="python") for f in training_files])
|
|
233
233
|
check_dataframe(all_df, "training_df")
|
|
234
234
|
|
|
235
|
-
# Features/Target output
|
|
236
235
|
print(f"Target: {target}")
|
|
237
|
-
print(f"Features: {
|
|
236
|
+
print(f"Features: {features}")
|
|
237
|
+
print(f"Hyperparameters: {hyperparameters}")
|
|
238
238
|
|
|
239
|
-
#
|
|
239
|
+
# -------------------------------------------------------------------------
|
|
240
|
+
# Preprocessing: Categorical features and decompression
|
|
241
|
+
# -------------------------------------------------------------------------
|
|
240
242
|
all_df, category_mappings = convert_categorical_types(all_df, features)
|
|
241
243
|
|
|
242
|
-
# If we have compressed features, decompress them
|
|
243
244
|
if compressed_features:
|
|
244
|
-
print(f"Decompressing features {compressed_features}
|
|
245
|
+
print(f"Decompressing features: {compressed_features}")
|
|
245
246
|
all_df, features = decompress_features(all_df, features, compressed_features)
|
|
246
247
|
|
|
247
|
-
#
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
df_val = all_df.copy()
|
|
252
|
-
|
|
253
|
-
# Does the dataframe have a training column?
|
|
254
|
-
elif "training" in all_df.columns:
|
|
255
|
-
print("Found training column, splitting data based on training column")
|
|
256
|
-
df_train = all_df[all_df["training"]]
|
|
257
|
-
df_val = all_df[~all_df["training"]]
|
|
258
|
-
else:
|
|
259
|
-
# Just do a random training Split
|
|
260
|
-
print("WARNING: No training column found, splitting data with random state=42")
|
|
261
|
-
df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
|
|
262
|
-
print(f"FIT/TRAIN: {df_train.shape}")
|
|
263
|
-
print(f"VALIDATION: {df_val.shape}")
|
|
264
|
-
|
|
265
|
-
# Use any hyperparameters to set up both the trainer and model configurations
|
|
266
|
-
print(f"Hyperparameters: {hyperparameters}")
|
|
267
|
-
|
|
268
|
-
# Now spin up our XGB Model
|
|
248
|
+
# -------------------------------------------------------------------------
|
|
249
|
+
# Classification setup
|
|
250
|
+
# -------------------------------------------------------------------------
|
|
251
|
+
label_encoder = None
|
|
269
252
|
if model_type == "classifier":
|
|
270
|
-
xgb_model = xgb.XGBClassifier(enable_categorical=True, **hyperparameters)
|
|
271
|
-
|
|
272
|
-
# Encode the target column
|
|
273
253
|
label_encoder = LabelEncoder()
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
else:
|
|
278
|
-
xgb_model = xgb.XGBRegressor(enable_categorical=True, **hyperparameters)
|
|
279
|
-
label_encoder = None # We don't need this for regression
|
|
280
|
-
|
|
281
|
-
# Grab our Features, Target and Train the Model
|
|
282
|
-
y_train = df_train[target]
|
|
283
|
-
X_train = df_train[features]
|
|
284
|
-
xgb_model.fit(X_train, y_train)
|
|
285
|
-
|
|
286
|
-
# Make Predictions on the Validation Set
|
|
287
|
-
print(f"Making Predictions on Validation Set...")
|
|
288
|
-
y_validate = df_val[target]
|
|
289
|
-
X_validate = df_val[features]
|
|
290
|
-
preds = xgb_model.predict(X_validate)
|
|
291
|
-
if model_type == "classifier":
|
|
292
|
-
# Also get the probabilities for each class
|
|
293
|
-
print("Processing Probabilities...")
|
|
294
|
-
probs = xgb_model.predict_proba(X_validate)
|
|
295
|
-
df_val["pred_proba"] = [p.tolist() for p in probs]
|
|
296
|
-
|
|
297
|
-
# Expand the pred_proba column into separate columns for each class
|
|
298
|
-
print(df_val.columns)
|
|
299
|
-
df_val = expand_proba_column(df_val, label_encoder.classes_)
|
|
300
|
-
print(df_val.columns)
|
|
301
|
-
|
|
302
|
-
# Decode the target and prediction labels
|
|
303
|
-
y_validate = label_encoder.inverse_transform(y_validate)
|
|
304
|
-
preds = label_encoder.inverse_transform(preds)
|
|
305
|
-
|
|
306
|
-
# Save predictions to S3 (just the target, prediction, and '_proba' columns)
|
|
307
|
-
df_val["prediction"] = preds
|
|
308
|
-
output_columns = [target, "prediction"]
|
|
309
|
-
output_columns += [col for col in df_val.columns if col.endswith("_proba")]
|
|
310
|
-
wr.s3.to_csv(
|
|
311
|
-
df_val[output_columns],
|
|
312
|
-
path=f"{model_metrics_s3_path}/validation_predictions.csv",
|
|
313
|
-
index=False,
|
|
314
|
-
)
|
|
254
|
+
all_df[target] = label_encoder.fit_transform(all_df[target])
|
|
255
|
+
print(f"Class labels: {label_encoder.classes_.tolist()}")
|
|
315
256
|
|
|
316
|
-
#
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
257
|
+
# -------------------------------------------------------------------------
|
|
258
|
+
# Cross-validation setup
|
|
259
|
+
# -------------------------------------------------------------------------
|
|
260
|
+
n_folds = hyperparameters["n_folds"]
|
|
261
|
+
xgb_params = {k: v for k, v in hyperparameters.items() if k not in WORKBENCH_PARAMS}
|
|
320
262
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
# Put the scores into a dataframe
|
|
325
|
-
score_df = pd.DataFrame(
|
|
326
|
-
{
|
|
327
|
-
target: label_names,
|
|
328
|
-
"precision": scores[0],
|
|
329
|
-
"recall": scores[1],
|
|
330
|
-
"f1": scores[2],
|
|
331
|
-
"support": scores[3],
|
|
332
|
-
}
|
|
333
|
-
)
|
|
334
|
-
|
|
335
|
-
# We need to get creative with the Classification Metrics
|
|
336
|
-
metrics = ["precision", "recall", "f1", "support"]
|
|
337
|
-
for t in label_names:
|
|
338
|
-
for m in metrics:
|
|
339
|
-
value = score_df.loc[score_df[target] == t, m].iloc[0]
|
|
340
|
-
print(f"Metrics:{t}:{m} {value}")
|
|
341
|
-
|
|
342
|
-
# Compute and output the confusion matrix
|
|
343
|
-
conf_mtx = confusion_matrix(y_validate, preds, labels=label_names)
|
|
344
|
-
for i, row_name in enumerate(label_names):
|
|
345
|
-
for j, col_name in enumerate(label_names):
|
|
346
|
-
value = conf_mtx[i, j]
|
|
347
|
-
print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
|
|
263
|
+
# Map 'seed' to 'random_state' for XGBoost
|
|
264
|
+
if "seed" in xgb_params:
|
|
265
|
+
xgb_params["random_state"] = xgb_params.pop("seed")
|
|
348
266
|
|
|
267
|
+
# Handle objective: filter regression-only params for classifiers, set default for regressors
|
|
268
|
+
if model_type == "classifier":
|
|
269
|
+
xgb_params = {k: v for k, v in xgb_params.items() if k not in REGRESSION_ONLY_PARAMS}
|
|
349
270
|
else:
|
|
350
|
-
#
|
|
351
|
-
|
|
352
|
-
mae = mean_absolute_error(y_validate, preds)
|
|
353
|
-
medae = median_absolute_error(y_validate, preds)
|
|
354
|
-
r2 = r2_score(y_validate, preds)
|
|
355
|
-
spearman_corr = spearmanr(y_validate, preds).correlation
|
|
356
|
-
support = len(df_val)
|
|
357
|
-
print(f"rmse: {rmse:.3f}")
|
|
358
|
-
print(f"mae: {mae:.3f}")
|
|
359
|
-
print(f"medae: {medae:.3f}")
|
|
360
|
-
print(f"r2: {r2:.3f}")
|
|
361
|
-
print(f"spearmanr: {spearman_corr:.3f}")
|
|
362
|
-
print(f"support: {support}")
|
|
363
|
-
|
|
364
|
-
# Now save the model to the standard place/name
|
|
365
|
-
joblib.dump(xgb_model, os.path.join(args.model_dir, "xgb_model.joblib"))
|
|
366
|
-
|
|
367
|
-
# Save the label encoder if we have one
|
|
368
|
-
if label_encoder:
|
|
369
|
-
joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
|
|
370
|
-
|
|
371
|
-
# Save the features (this will validate input during predictions)
|
|
372
|
-
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
|
|
373
|
-
json.dump(orig_features, fp) # We save the original features, not the decompressed ones
|
|
374
|
-
|
|
375
|
-
# Save the category mappings
|
|
376
|
-
with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as fp:
|
|
377
|
-
json.dump(category_mappings, fp)
|
|
271
|
+
# Default to MAE (reg:absoluteerror) for regression if not specified
|
|
272
|
+
xgb_params.setdefault("objective", "reg:absoluteerror")
|
|
378
273
|
|
|
274
|
+
print(f"XGBoost params: {xgb_params}")
|
|
379
275
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
276
|
+
if n_folds == 1:
|
|
277
|
+
# Single train/val split
|
|
278
|
+
if "training" in all_df.columns:
|
|
279
|
+
print("Using 'training' column for train/val split")
|
|
280
|
+
train_idx = np.where(all_df["training"])[0]
|
|
281
|
+
val_idx = np.where(~all_df["training"])[0]
|
|
282
|
+
else:
|
|
283
|
+
print("WARNING: No 'training' column found, using random 80/20 split")
|
|
284
|
+
indices = np.arange(len(all_df))
|
|
285
|
+
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
|
|
286
|
+
folds = [(train_idx, val_idx)]
|
|
287
|
+
else:
|
|
288
|
+
# K-fold cross-validation
|
|
289
|
+
if model_type == "classifier":
|
|
290
|
+
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
291
|
+
folds = list(kfold.split(all_df, all_df[target]))
|
|
292
|
+
else:
|
|
293
|
+
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
294
|
+
folds = list(kfold.split(all_df))
|
|
391
295
|
|
|
392
|
-
|
|
393
|
-
if isinstance(input_data, bytes):
|
|
394
|
-
input_data = input_data.decode("utf-8")
|
|
296
|
+
print(f"Training {'single model' if n_folds == 1 else f'{n_folds}-fold ensemble'}...")
|
|
395
297
|
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
298
|
+
# -------------------------------------------------------------------------
|
|
299
|
+
# Training loop
|
|
300
|
+
# -------------------------------------------------------------------------
|
|
301
|
+
# Initialize out-of-fold storage
|
|
302
|
+
oof_predictions = np.full(len(all_df), np.nan, dtype=np.float64)
|
|
303
|
+
if model_type == "classifier":
|
|
304
|
+
num_classes = len(label_encoder.classes_)
|
|
305
|
+
oof_proba = np.full((len(all_df), num_classes), np.nan, dtype=np.float64)
|
|
400
306
|
else:
|
|
401
|
-
|
|
402
|
-
|
|
307
|
+
oof_proba = None
|
|
308
|
+
|
|
309
|
+
# Check for sample weights
|
|
310
|
+
has_sample_weights = "sample_weight" in all_df.columns
|
|
311
|
+
if has_sample_weights:
|
|
312
|
+
sw = all_df["sample_weight"]
|
|
313
|
+
print(f"Using sample weights: min={sw.min():.2f}, max={sw.max():.2f}, mean={sw.mean():.2f}")
|
|
314
|
+
|
|
315
|
+
# Train ensemble
|
|
316
|
+
ensemble_models = []
|
|
317
|
+
for fold_idx, (train_idx, val_idx) in enumerate(folds):
|
|
318
|
+
print(f"\n{'='*50}")
|
|
319
|
+
print(f"Fold {fold_idx + 1}/{len(folds)} - Train: {len(train_idx)}, Val: {len(val_idx)}")
|
|
320
|
+
print(f"{'='*50}")
|
|
321
|
+
|
|
322
|
+
# Prepare fold data
|
|
323
|
+
X_train = all_df.iloc[train_idx][features]
|
|
324
|
+
y_train = all_df.iloc[train_idx][target]
|
|
325
|
+
X_val = all_df.iloc[val_idx][features]
|
|
326
|
+
sample_weights = all_df.iloc[train_idx]["sample_weight"] if has_sample_weights else None
|
|
327
|
+
|
|
328
|
+
# Create model with fold-specific random state for diversity
|
|
329
|
+
fold_params = {**xgb_params, "random_state": xgb_params.get("random_state", 42) + fold_idx}
|
|
330
|
+
if model_type == "classifier":
|
|
331
|
+
model = xgb.XGBClassifier(enable_categorical=True, **fold_params)
|
|
332
|
+
else:
|
|
333
|
+
model = xgb.XGBRegressor(enable_categorical=True, **fold_params)
|
|
334
|
+
|
|
335
|
+
# Train
|
|
336
|
+
model.fit(X_train, y_train, sample_weight=sample_weights)
|
|
337
|
+
ensemble_models.append(model)
|
|
338
|
+
|
|
339
|
+
# Out-of-fold predictions
|
|
340
|
+
oof_predictions[val_idx] = model.predict(X_val)
|
|
341
|
+
if model_type == "classifier":
|
|
342
|
+
oof_proba[val_idx] = model.predict_proba(X_val)
|
|
343
|
+
|
|
344
|
+
print(f"\nTraining complete! Trained {len(ensemble_models)} model(s).")
|
|
345
|
+
|
|
346
|
+
# -------------------------------------------------------------------------
|
|
347
|
+
# Prepare validation results
|
|
348
|
+
# -------------------------------------------------------------------------
|
|
349
|
+
if n_folds == 1:
|
|
350
|
+
# Single fold: only validation rows
|
|
351
|
+
val_mask = ~np.isnan(oof_predictions)
|
|
352
|
+
df_val = all_df[val_mask].copy()
|
|
353
|
+
predictions = oof_predictions[val_mask]
|
|
354
|
+
if oof_proba is not None:
|
|
355
|
+
oof_proba = oof_proba[val_mask]
|
|
356
|
+
else:
|
|
357
|
+
# K-fold: all rows have out-of-fold predictions
|
|
358
|
+
df_val = all_df.copy()
|
|
359
|
+
predictions = oof_predictions
|
|
403
360
|
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
361
|
+
# Decode labels for classification
|
|
362
|
+
if model_type == "classifier":
|
|
363
|
+
df_val[target] = label_encoder.inverse_transform(df_val[target].astype(int))
|
|
364
|
+
df_val["prediction"] = label_encoder.inverse_transform(predictions.astype(int))
|
|
365
|
+
if oof_proba is not None:
|
|
366
|
+
df_val["pred_proba"] = [p.tolist() for p in oof_proba]
|
|
367
|
+
df_val = expand_proba_column(df_val, label_encoder.classes_)
|
|
411
368
|
else:
|
|
412
|
-
|
|
369
|
+
df_val["prediction"] = predictions
|
|
413
370
|
|
|
371
|
+
# -------------------------------------------------------------------------
|
|
372
|
+
# Compute and print metrics
|
|
373
|
+
# -------------------------------------------------------------------------
|
|
374
|
+
y_true = df_val[target].values
|
|
375
|
+
y_pred = df_val["prediction"].values
|
|
414
376
|
|
|
415
|
-
|
|
416
|
-
|
|
377
|
+
if model_type == "classifier":
|
|
378
|
+
label_names = label_encoder.classes_
|
|
379
|
+
score_df = compute_classification_metrics(y_true, y_pred, label_names, target)
|
|
380
|
+
print_classification_metrics(score_df, target, label_names)
|
|
381
|
+
print_confusion_matrix(y_true, y_pred, label_names)
|
|
382
|
+
else:
|
|
383
|
+
metrics = compute_regression_metrics(y_true, y_pred)
|
|
384
|
+
print_regression_metrics(metrics)
|
|
385
|
+
|
|
386
|
+
# Compute ensemble prediction_std
|
|
387
|
+
if n_folds > 1:
|
|
388
|
+
all_preds = np.stack([m.predict(all_df[features]) for m in ensemble_models])
|
|
389
|
+
df_val["prediction_std"] = np.std(all_preds, axis=0)
|
|
390
|
+
print(f"Ensemble std - mean: {df_val['prediction_std'].mean():.4f}, max: {df_val['prediction_std'].max():.4f}")
|
|
391
|
+
else:
|
|
392
|
+
df_val["prediction_std"] = 0.0
|
|
393
|
+
|
|
394
|
+
# Train UQ models for uncertainty quantification
|
|
395
|
+
print("\n" + "=" * 50)
|
|
396
|
+
print("Training UQ Models")
|
|
397
|
+
print("=" * 50)
|
|
398
|
+
uq_models, uq_metadata = train_uq_models(
|
|
399
|
+
all_df[features], all_df[target], df_val[features], y_true
|
|
400
|
+
)
|
|
401
|
+
df_val = predict_intervals(df_val, df_val[features], uq_models, uq_metadata)
|
|
402
|
+
df_val = compute_confidence(df_val, uq_metadata["median_interval_width"])
|
|
417
403
|
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
404
|
+
# -------------------------------------------------------------------------
|
|
405
|
+
# Save validation predictions to S3
|
|
406
|
+
# -------------------------------------------------------------------------
|
|
407
|
+
output_columns = []
|
|
408
|
+
if id_column in df_val.columns:
|
|
409
|
+
output_columns.append(id_column)
|
|
410
|
+
output_columns += [target, "prediction"]
|
|
421
411
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
compressed_features = TEMPLATE_PARAMS["compressed_features"]
|
|
412
|
+
if model_type != "classifier":
|
|
413
|
+
output_columns.append("prediction_std")
|
|
414
|
+
output_columns += [c for c in df_val.columns if c.startswith("q_") or c == "confidence"]
|
|
426
415
|
|
|
427
|
-
|
|
428
|
-
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
|
|
429
|
-
with open(os.path.join(model_dir, "feature_columns.json")) as fp:
|
|
430
|
-
features = json.load(fp)
|
|
431
|
-
print(f"Model Features: {features}")
|
|
416
|
+
output_columns += [c for c in df_val.columns if c.endswith("_proba")]
|
|
432
417
|
|
|
433
|
-
|
|
434
|
-
with open(os.path.join(model_dir, "category_mappings.json")) as fp:
|
|
435
|
-
category_mappings = json.load(fp)
|
|
418
|
+
wr.s3.to_csv(df_val[output_columns], f"{model_metrics_s3_path}/validation_predictions.csv", index=False)
|
|
436
419
|
|
|
437
|
-
#
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
420
|
+
# -------------------------------------------------------------------------
|
|
421
|
+
# Save model artifacts
|
|
422
|
+
# -------------------------------------------------------------------------
|
|
423
|
+
for idx, m in enumerate(ensemble_models):
|
|
424
|
+
joblib.dump(m, os.path.join(args.model_dir, f"xgb_model_{idx}.joblib"))
|
|
425
|
+
print(f"Saved {len(ensemble_models)} model(s)")
|
|
441
426
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
# - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
|
|
445
|
-
matched_df = match_features_case_insensitive(df, features)
|
|
427
|
+
with open(os.path.join(args.model_dir, "ensemble_metadata.json"), "w") as f:
|
|
428
|
+
json.dump({"n_ensemble": len(ensemble_models), "n_folds": n_folds}, f)
|
|
446
429
|
|
|
447
|
-
|
|
448
|
-
|
|
430
|
+
with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as f:
|
|
431
|
+
json.dump(orig_features, f)
|
|
449
432
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
print("Decompressing features for prediction...")
|
|
453
|
-
matched_df, features = decompress_features(matched_df, features, compressed_features)
|
|
433
|
+
with open(os.path.join(args.model_dir, "category_mappings.json"), "w") as f:
|
|
434
|
+
json.dump(category_mappings, f)
|
|
454
435
|
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
predictions = model.predict(X)
|
|
436
|
+
with open(os.path.join(args.model_dir, "hyperparameters.json"), "w") as f:
|
|
437
|
+
json.dump(hyperparameters, f, indent=2)
|
|
458
438
|
|
|
459
|
-
# If we have a label encoder, decode the predictions
|
|
460
439
|
if label_encoder:
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
# Set the predictions on the DataFrame
|
|
464
|
-
df["prediction"] = predictions
|
|
465
|
-
|
|
466
|
-
# Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
|
|
467
|
-
if getattr(model, "predict_proba", None):
|
|
468
|
-
probs = model.predict_proba(matched_df[features])
|
|
469
|
-
df["pred_proba"] = [p.tolist() for p in probs]
|
|
440
|
+
joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
|
|
470
441
|
|
|
471
|
-
|
|
472
|
-
|
|
442
|
+
if model_type != "classifier":
|
|
443
|
+
save_uq_models(uq_models, uq_metadata, args.model_dir)
|
|
473
444
|
|
|
474
|
-
|
|
475
|
-
return df
|
|
445
|
+
print(f"\nModel training complete! Artifacts saved to {args.model_dir}")
|