workbench 0.8.178__py3-none-any.whl → 0.8.179__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of workbench might be problematic.

Files changed (25)
  1. workbench/api/endpoint.py +3 -2
  2. workbench/core/artifacts/endpoint_core.py +5 -5
  3. workbench/core/artifacts/feature_set_core.py +32 -2
  4. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +3 -5
  5. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  6. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +10 -17
  7. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  8. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +37 -34
  9. workbench/model_scripts/custom_models/uq_models/mapie.template +35 -32
  10. workbench/model_scripts/custom_models/uq_models/meta_uq.template +7 -22
  11. workbench/model_scripts/custom_models/uq_models/ngboost.template +5 -12
  12. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +5 -13
  13. workbench/model_scripts/pytorch_model/pytorch.template +9 -18
  14. workbench/model_scripts/quant_regression/quant_regression.template +5 -10
  15. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  16. workbench/model_scripts/xgb_model/generated_model_script.py +24 -33
  17. workbench/model_scripts/xgb_model/xgb_model.template +23 -32
  18. workbench/utils/model_utils.py +2 -1
  19. workbench/utils/xgboost_model_utils.py +160 -137
  20. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/METADATA +1 -1
  21. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/RECORD +25 -25
  22. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/WHEEL +0 -0
  23. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/entry_points.txt +0 -0
  24. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/licenses/LICENSE +0 -0
  25. {workbench-0.8.178.dist-info → workbench-0.8.179.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -25,7 +21,8 @@ TEMPLATE_PARAMS = {
     "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
+    "hyperparameters": "{{hyperparameters}}",
 }
 
 
@@ -101,7 +98,7 @@ def convert_categorical_types(df: pd.DataFrame, features: list, category_mapping
 
 
 def decompress_features(
-    df: pd.DataFrame, features: List[str], compressed_features: List[str]
+    df: pd.DataFrame, features: List[str], compressed_features: List[str]
 ) -> Tuple[pd.DataFrame, List[str]]:
     """Prepare features for the model by decompressing bitstring features
 
@@ -162,6 +159,7 @@ if __name__ == "__main__":
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    hyperparameters = TEMPLATE_PARAMS["hyperparameters"]
     validation_split = 0.2
 
     # Script arguments for input/output directories
@@ -174,11 +172,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -213,9 +207,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -227,7 +219,8 @@ if __name__ == "__main__":
 
     # Train XGBoost for point predictions
     print("\nTraining XGBoost for point predictions...")
-    xgb_model = XGBRegressor(enable_categorical=True)
+    print(f" Hyperparameters: {hyperparameters}")
+    xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
    xgb_model.fit(X_train, y_train)
 
     # Evaluate XGBoost performance
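The new `hyperparameters` entry in `TEMPLATE_PARAMS` is rendered into the generated script and unpacked straight into the `XGBRegressor` constructor, so user-supplied settings override XGBoost's defaults while an empty dict keeps the old behavior. A minimal sketch of what the rendered code amounts to; the parameter values here are illustrative, not workbench defaults:

```python
from xgboost import XGBRegressor

# Hypothetical rendered value of the "{{hyperparameters}}" placeholder
# (illustrative only; the real dict comes from the workbench model definition)
hyperparameters = {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1}

# Supplied keys override XGBoost defaults; {} reproduces the previous
# XGBRegressor(enable_categorical=True) behavior
xgb_model = XGBRegressor(enable_categorical=True, **hyperparameters)
```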
@@ -272,7 +265,7 @@ if __name__ == "__main__":
             colsample_bytree=0.8,
             random_state=42,
             verbose=-1,
-            force_col_wise=True
+            force_col_wise=True,
         )
         est.fit(X_train, y_train)
         quantile_estimators.append(est)
@@ -280,9 +273,7 @@ if __name__ == "__main__":
 
         # Create MAPIE CQR model for this confidence level
         print(f" Setting up MAPIE CQR for {confidence_level * 100:.0f}% confidence...")
         mapie_model = ConformalizedQuantileRegressor(
-            quantile_estimators,
-            confidence_level=confidence_level,
-            prefit=True
+            quantile_estimators, confidence_level=confidence_level, prefit=True
         )
 
         # Conformalize the model
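Because the quantile estimators are handed over already fitted (`prefit=True`), MAPIE only calibrates the conformal intervals on held-out data. A rough, self-contained sketch of that flow on toy data, assuming MAPIE's v1-style `conformalize`/`predict_interval` interface and an estimator order of lower, upper, median; treat this as an illustration rather than the template's exact code:

```python
import lightgbm as lgb
import numpy as np
from mapie.regression import ConformalizedQuantileRegressor
from sklearn.model_selection import train_test_split

# Toy data (illustrative)
rng = np.random.default_rng(42)
X = rng.normal(size=(500, 4))
y = 2.0 * X[:, 0] + rng.normal(scale=0.5, size=500)
X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3, random_state=42)

confidence_level = 0.95
alpha = 1 - confidence_level

# Pre-fit lower, upper, and median quantile estimators (order assumed)
quantile_estimators = [
    lgb.LGBMRegressor(objective="quantile", alpha=q, verbose=-1).fit(X_train, y_train)
    for q in (alpha / 2, 1 - alpha / 2, 0.5)
]

# prefit=True: MAPIE skips fitting and only conformalizes on held-out data
mapie_model = ConformalizedQuantileRegressor(
    quantile_estimators, confidence_level=confidence_level, prefit=True
)
mapie_model.conformalize(X_cal, y_cal)

# Point predictions plus conformalized lower/upper bounds
y_pred, y_intervals = mapie_model.predict_interval(X_cal)
```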
@@ -337,8 +328,8 @@ if __name__ == "__main__":
             "xgb_rmse": float(xgb_rmse),
             "xgb_mae": float(xgb_mae),
             "xgb_r2": float(xgb_r2),
-            "n_validation": len(df_val)
-        }
+            "n_validation": len(df_val),
+        },
     }
     with open(os.path.join(args.model_dir, "model_config.json"), "w") as fp:
         json.dump(model_config, fp, indent=2)
@@ -379,7 +370,7 @@ def model_fn(model_dir) -> dict:
         "xgb_model": xgb_model,
         "mapie_models": mapie_models,
         "confidence_levels": config["confidence_levels"],
-        "category_mappings": category_mappings
+        "category_mappings": category_mappings,
     }
 
 
@@ -404,7 +395,7 @@ def output_fn(output_df, accept_type):
     """Supports both CSV and JSON output formats."""
     if "text/csv" in accept_type:
         # Convert categorical columns to string to avoid fillna issues
-        for col in output_df.select_dtypes(include=['category']).columns:
+        for col in output_df.select_dtypes(include=["category"]).columns:
             output_df[col] = output_df[col].astype(str)
         csv_output = output_df.fillna("N/A").to_csv(index=False)
         return csv_output, "text/csv"
@@ -425,6 +416,10 @@ def predict_fn(df, models) -> pd.DataFrame:
         pd.DataFrame: DataFrame with XGBoost predictions and conformalized intervals
     """
 
+    # Flag for outlier stretch adjustment for the prediction intervals
+    # if the predicted values are outside the intervals
+    outlier_stretch = False
+
     # Grab our feature columns (from training)
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
@@ -435,11 +430,7 @@ def predict_fn(df, models) -> pd.DataFrame:
 
     # Apply categorical mappings if they exist
     if models.get("category_mappings"):
-        matched_df, _ = convert_categorical_types(
-            matched_df,
-            model_features,
-            models["category_mappings"]
-        )
+        matched_df, _ = convert_categorical_types(matched_df, model_features, models["category_mappings"])
 
     # Get features for prediction
     X = matched_df[model_features]
@@ -475,7 +466,7 @@ def predict_fn(df, models) -> pd.DataFrame:
     # Add median (q_50) from XGBoost prediction
     df["q_50"] = df["prediction"]
 
-    # Calculate a psueduo-standard deviation from the 68% interval width
+    # Calculate a pseudo-standard deviation from the 68% interval width
     df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0
 
     # Reorder the quantile columns for easier reading
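The pseudo-standard deviation relies on the fact that, for roughly Gaussian errors, the central 68% interval spans about one standard deviation on each side of the mean, so half of the q_16-to-q_84 width approximates sigma. A quick check of that assumption:

```python
from scipy.stats import norm

# Half-width of the central 68% interval of a standard normal is ~1.0
half_width = (norm.ppf(0.84) - norm.ppf(0.16)) / 2.0
print(round(half_width, 3))  # 0.994
```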
@@ -484,7 +475,19 @@ def predict_fn(df, models) -> pd.DataFrame:
     df = df[other_cols + quantile_cols]
 
     # Adjust the outer quantiles to ensure they encompass the prediction
-    df["q_025"] = np.minimum(df["q_025"], df["prediction"])
-    df["q_975"] = np.maximum(df["q_975"], df["prediction"])
+    if outlier_stretch:
+        # Lower intervals adjustments
+        df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+        df["q_05"] = np.minimum(df["q_05"], df["prediction"])
+        df["q_10"] = np.minimum(df["q_10"], df["prediction"])
+        df["q_16"] = np.minimum(df["q_16"], df["prediction"])
+        df["q_25"] = np.minimum(df["q_25"], df["prediction"])
+
+        # Upper intervals adjustments
+        df["q_75"] = np.maximum(df["q_75"], df["prediction"])
+        df["q_84"] = np.maximum(df["q_84"], df["prediction"])
+        df["q_90"] = np.maximum(df["q_90"], df["prediction"])
+        df["q_95"] = np.maximum(df["q_95"], df["prediction"])
+        df["q_975"] = np.maximum(df["q_975"], df["prediction"])
 
     return df
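With `outlier_stretch` enabled, every lower quantile is pulled down to the point prediction and every upper quantile pushed up, so no interval excludes its own prediction. The ten clamps could equally be expressed as a loop over the quantile columns; a hypothetical, behavior-equivalent helper for illustration:

```python
import numpy as np
import pandas as pd

LOWER_COLS = ["q_025", "q_05", "q_10", "q_16", "q_25"]
UPPER_COLS = ["q_75", "q_84", "q_90", "q_95", "q_975"]


def stretch_intervals(df: pd.DataFrame) -> pd.DataFrame:
    """Widen each quantile column so every interval covers the point prediction."""
    for col in LOWER_COLS:
        df[col] = np.minimum(df[col], df["prediction"])
    for col in UPPER_COLS:
        df[col] = np.maximum(df[col], df["prediction"])
    return df
```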
@@ -5,11 +5,7 @@ from xgboost import XGBRegressor  # Point Estimator
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -24,7 +20,6 @@ from typing import List, Tuple
 from proximity import Proximity
 
 
-
 # Template Placeholders
 TEMPLATE_PARAMS = {
     "id_column": "{{id_column}}",
@@ -32,7 +27,7 @@ TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "train_all_data": "{{train_all_data}}",
-    "track_columns": "{{track_columns}}"
+    "track_columns": "{{track_columns}}",
 }
 
 
@@ -183,11 +178,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -222,9 +213,7 @@ if __name__ == "__main__":
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -289,11 +278,7 @@ def model_fn(model_dir) -> dict:
     # Deserialize the proximity model
     prox_model = Proximity.deserialize(model_dir)
 
-    return {
-        "xgboost": xgb_model,
-        "ngboost": ngb_model,
-        "proximity": prox_model
-    }
+    return {"xgboost": xgb_model, "ngboost": ngb_model, "proximity": prox_model}
 
 
 def input_fn(input_data, content_type):
@@ -353,8 +338,8 @@ def predict_fn(df, models) -> pd.DataFrame:
     dist_params = y_dists.params
 
     # Extract mean and std from distribution parameters
-    df["prediction_uq"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction_uq"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation
 
     # Add 95% prediction intervals using ppf (percent point function)
     # Note: Our hybrid model uses XGB point prediction and NGBoost UQ
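Both UQ scripts pull the per-row mean and scale out of the NGBoost predictive distribution via `.params` and then use its `ppf` for interval endpoints, as the hunks above and below show. A small self-contained sketch of that pattern on toy data (column names chosen to match the templates):

```python
import numpy as np
import pandas as pd
from ngboost import NGBRegressor

# Toy data (illustrative)
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 3))
y = X[:, 0] + rng.normal(scale=0.3, size=300)

ngb_model = NGBRegressor(verbose=False).fit(X, y)

# pred_dist returns a distribution object; .params holds per-row loc/scale arrays
y_dists = ngb_model.pred_dist(X[:5])
df = pd.DataFrame(X[:5], columns=["f0", "f1", "f2"])
df["prediction"] = y_dists.params["loc"]        # mean
df["prediction_std"] = y_dists.params["scale"]  # standard deviation
df["q_025"] = y_dists.ppf(0.025)                # 2.5th percentile
df["q_975"] = y_dists.ppf(0.975)                # 97.5th percentile
print(df)
```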
@@ -3,11 +3,7 @@ from ngboost import NGBRegressor
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -21,7 +17,7 @@ import pandas as pd
 TEMPLATE_PARAMS = {
     "features": "{{feature_list}}",
     "target": "{{target_column}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 
@@ -87,10 +83,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -212,8 +205,8 @@ def predict_fn(df, model) -> pd.DataFrame:
     dist_params = y_dists.params
 
     # Extract mean and std from distribution parameters
-    df["prediction"] = dist_params['loc']  # mean
-    df["prediction_std"] = dist_params['scale']  # standard deviation
+    df["prediction"] = dist_params["loc"]  # mean
+    df["prediction_std"] = dist_params["scale"]  # standard deviation
 
     # Add 95% prediction intervals using ppf (percent point function)
     df["q_025"] = y_dists.ppf(0.025)  # 2.5th percentile
@@ -3,7 +3,7 @@ TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
     "target_column": "{{target_column}}",
     "feature_list": "{{feature_list}}",
-    "model_metrics_s3_path": "{{model_metrics_s3_path}}"
+    "model_metrics_s3_path": "{{model_metrics_s3_path}}",
 }
 
 # Imports for XGB Model
@@ -12,11 +12,7 @@ import awswrangler as wr
 import numpy as np
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -39,6 +35,7 @@ def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
         print(msg)
         raise ValueError(msg)
 
+
 def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
     """
     Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive).
@@ -95,11 +92,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -150,7 +143,6 @@ if __name__ == "__main__":
     result_df["residual"] = result_df[target] - result_df["prediction"]
     result_df["residual_abs"] = result_df["residual"].abs()
 
-
     # Save the results dataframe to S3
     wr.s3.to_csv(
         result_df,
@@ -210,7 +202,7 @@ def input_fn(input_data, content_type):
     """Parse input data and return a DataFrame."""
     if not input_data:
         raise ValueError("Empty input data is not supported!")
-
+
     # Decode bytes to string if necessary
     if isinstance(input_data, bytes):
         input_data = input_data.decode("utf-8")
@@ -36,12 +36,12 @@ from typing import List, Tuple
 # Template Parameters
 TEMPLATE_PARAMS = {
     "model_type": "{{model_type}}",
-    "target_column": "{{target_column}}",
+    "target": "{{target_column}}",
     "features": "{{feature_list}}",
     "compressed_features": "{{compressed_features}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
     "train_all_data": "{{train_all_data}}",
-    "hyperparameters": "{{hyperparameters}}"
+    "hyperparameters": "{{hyperparameters}}",
 }
 
 
@@ -103,7 +103,6 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     df_columns_lower = {col.lower(): col for col in df.columns}
     rename_dict = {}
     missing = []
-
     for feature in model_features:
         if feature in df.columns:
             continue  # Exact match
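The `match_features_case_insensitive` helper only appears in fragments across these hunks; a condensed sketch of what it does, reconstructed from those fragments (the case-insensitive fallback branch is inferred, not copied verbatim):

```python
import pandas as pd


def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
    """Rename DataFrame columns to the model's feature names, matching case-insensitively."""
    df_columns_lower = {col.lower(): col for col in df.columns}
    rename_dict = {}
    missing = []
    for feature in model_features:
        if feature in df.columns:
            continue  # Exact match
        if feature.lower() in df_columns_lower:
            rename_dict[df_columns_lower[feature.lower()]] = feature  # Case-insensitive match
        else:
            missing.append(feature)
    if missing:
        raise ValueError(f"Features not found: {missing}")
    # Rename the DataFrame columns to match the model features
    return df.rename(columns=rename_dict)
```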
@@ -115,6 +114,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     if missing:
         raise ValueError(f"Features not found: {missing}")
 
+    # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
 
@@ -210,7 +210,7 @@ def model_fn(model_dir):
     original_cwd = os.getcwd()
     try:
         # Change to /tmp because Pytorch Tabular needs write access (creates a .pt_tmp directory)
-        os.chdir('/tmp')
+        os.chdir("/tmp")
 
         # Load the model
         model_path = os.path.join(model_dir, "tabular_model")
@@ -328,7 +328,7 @@ if __name__ == "__main__":
     """The main function is for training the PyTorch Tabular model"""
 
     # Harness Template Parameters
-    target = TEMPLATE_PARAMS["target_column"]
+    target = TEMPLATE_PARAMS["target"]
     features = TEMPLATE_PARAMS["features"]
     orig_features = features.copy()
     compressed_features = TEMPLATE_PARAMS["compressed_features"]
@@ -348,11 +348,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -433,8 +429,7 @@ if __name__ == "__main__":
     }
 
     # Override defaults with training_config if present
-    training_overrides = {k: v for k, v in hyperparameters.get('training_config', {}).items()
-                          if k in trainer_defaults}
+    training_overrides = {k: v for k, v in hyperparameters.get("training_config", {}).items() if k in trainer_defaults}
     # Print overwrites
     for key, value in training_overrides.items():
         print(f"TRAINING CONFIG Override: {key}: {trainer_defaults[key]} → {value}")
@@ -451,8 +446,7 @@ if __name__ == "__main__":
         "initialization": "kaiming",
     }
     # Override defaults with model_config if present
-    model_overrides = {k: v for k, v in hyperparameters.get('model_config', {}).items()
-                       if k in model_defaults}
+    model_overrides = {k: v for k, v in hyperparameters.get("model_config", {}).items() if k in model_defaults}
     # Print overwrites
     for key, value in model_overrides.items():
         print(f"MODEL CONFIG Override: {key}: {model_defaults[key]} → {value}")
@@ -461,10 +455,7 @@ if __name__ == "__main__":
     # Use CategoryEmbedding model configuration for general-purpose tabular modeling.
     # Works effectively for both regression and classification as the foundational
     # architecture in PyTorch Tabular
-    model_config = CategoryEmbeddingModelConfig(
-        task=task,
-        **model_params
-    )
+    model_config = CategoryEmbeddingModelConfig(task=task, **model_params)
     optimizer_config = OptimizerConfig()
 
     #####################################
@@ -4,11 +4,7 @@ import awswrangler as wr
 from sklearn.model_selection import train_test_split
 
 # Model Performance Scores
-from sklearn.metrics import (
-    mean_absolute_error,
-    r2_score,
-    root_mean_squared_error
-)
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
 
 from io import StringIO
 import json
@@ -22,9 +18,10 @@ TEMPLATE_PARAMS = {
     "target_column": "{{target_column}}",
     "features": "{{feature_list}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
+
 # Function to check if dataframe is empty
 def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
     """
@@ -64,6 +61,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     # Rename the DataFrame columns to match the model features
     return df.rename(columns=rename_dict)
 
+
 if __name__ == "__main__":
     """The main function is for training the XGBoost Quantile Regression models"""
 
@@ -86,10 +84,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -8,7 +8,7 @@ TEMPLATE_PARAMS = {
     "feature_list": "{{feature_list}}",
     "model_class": "{{model_class}}",
     "model_metrics_s3_path": "{{model_metrics_s3_path}}",
-    "train_all_data": "{{train_all_data}}"
+    "train_all_data": "{{train_all_data}}",
 }
 
 import awswrangler as wr
@@ -99,10 +99,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     # Load training data from the specified directory
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train) if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
 
     # Check if the DataFrame is empty
@@ -116,10 +113,7 @@ if __name__ == "__main__":
 
     if needs_standardization:
         # Create a pipeline with standardization and the model
-        model = Pipeline([
-            ("scaler", StandardScaler()),
-            ("model", model)
-        ])
+        model = Pipeline([("scaler", StandardScaler()), ("model", model)])
 
     # Handle logic based on the model_type
     if model_type in ["classifier", "regressor"]:
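The scikit-learn template wraps the estimator in a `Pipeline` with a `StandardScaler` when the configured model class benefits from standardized inputs, so the same scaling is applied at both fit and predict time. A minimal sketch of that wrapping; the model class and the `needs_standardization` decision here are illustrative:

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

model = SVR()  # illustrative model_class; the template instantiates the configured one

# Illustrative flag; the template derives this from the configured model class
needs_standardization = True
if needs_standardization:
    # The scaler lives inside the pipeline, so predict() reuses the fitted scaling
    model = Pipeline([("scaler", StandardScaler()), ("model", model)])
```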
@@ -206,6 +200,7 @@ if __name__ == "__main__":
     with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
         json.dump(feature_list, fp)
 
+
 #
 # Inference Section
 #