workbench 0.8.197__py3-none-any.whl → 0.8.201__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. workbench/algorithms/dataframe/proximity.py +19 -12
  2. workbench/api/__init__.py +2 -1
  3. workbench/api/feature_set.py +7 -4
  4. workbench/api/model.py +1 -1
  5. workbench/core/artifacts/__init__.py +11 -2
  6. workbench/core/artifacts/endpoint_core.py +84 -46
  7. workbench/core/artifacts/feature_set_core.py +69 -1
  8. workbench/core/artifacts/model_core.py +37 -7
  9. workbench/core/cloud_platform/aws/aws_parameter_store.py +18 -2
  10. workbench/core/transforms/features_to_model/features_to_model.py +23 -20
  11. workbench/core/views/view.py +2 -2
  12. workbench/model_scripts/chemprop/chemprop.template +931 -0
  13. workbench/model_scripts/chemprop/generated_model_script.py +931 -0
  14. workbench/model_scripts/chemprop/requirements.txt +11 -0
  15. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  16. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  17. workbench/model_scripts/custom_models/proximity/proximity.py +19 -12
  18. workbench/model_scripts/custom_models/uq_models/proximity.py +19 -12
  19. workbench/model_scripts/pytorch_model/generated_model_script.py +130 -88
  20. workbench/model_scripts/pytorch_model/pytorch.template +128 -86
  21. workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
  22. workbench/model_scripts/script_generation.py +10 -7
  23. workbench/model_scripts/uq_models/generated_model_script.py +25 -18
  24. workbench/model_scripts/uq_models/mapie.template +23 -16
  25. workbench/model_scripts/xgb_model/generated_model_script.py +6 -6
  26. workbench/model_scripts/xgb_model/xgb_model.template +2 -2
  27. workbench/repl/workbench_shell.py +14 -5
  28. workbench/scripts/endpoint_test.py +162 -0
  29. workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
  30. workbench/utils/chemprop_utils.py +724 -0
  31. workbench/utils/pytorch_utils.py +497 -0
  32. workbench/utils/xgboost_model_utils.py +12 -5
  33. {workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/METADATA +2 -2
  34. {workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/RECORD +38 -30
  35. {workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/entry_points.txt +2 -1
  36. {workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/WHEEL +0 -0
  37. {workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/licenses/LICENSE +0 -0
  38. {workbench-0.8.197.dist-info → workbench-0.8.201.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,497 @@
1
+ """PyTorch Tabular utilities for Workbench models."""
2
+
3
+ # flake8: noqa: E402
4
+ import logging
5
+ import os
6
+ import tempfile
7
+ from pprint import pformat
8
+ from typing import Any, Tuple
9
+
10
+ # Disable OpenMP parallelism to avoid segfaults on macOS with conflicting OpenMP runtimes
11
+ # (libomp from LLVM vs libiomp from Intel). Must be set before importing numpy/sklearn/torch.
12
+ # See: https://github.com/scikit-learn/scikit-learn/issues/21302
13
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
14
+ os.environ.setdefault("MKL_NUM_THREADS", "1")
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ from scipy.stats import spearmanr
19
+ from sklearn.metrics import (
20
+ mean_absolute_error,
21
+ mean_squared_error,
22
+ median_absolute_error,
23
+ precision_recall_fscore_support,
24
+ r2_score,
25
+ roc_auc_score,
26
+ )
27
+ from sklearn.model_selection import KFold, StratifiedKFold
28
+ from sklearn.preprocessing import LabelEncoder
29
+
30
+ from workbench.utils.model_utils import safe_extract_tarfile
31
+ from workbench.utils.pandas_utils import expand_proba_column
32
+
33
+ log = logging.getLogger("workbench")
34
+
35
+
36
def download_and_extract_model(s3_uri: str, model_dir: str) -> None:
    """Fetch a model artifact (model.tar.gz) from S3 and unpack it into model_dir.

    Args:
        s3_uri: S3 URI to the model artifact (model.tar.gz)
        model_dir: Directory to extract model artifacts to
    """
    import awswrangler as wr

    log.info(f"Downloading model from {s3_uri}...")

    # Pull the tarball down into the extraction directory itself
    tar_path = os.path.join(model_dir, "model.tar.gz")
    wr.s3.download(path=s3_uri, local_file=tar_path)

    # Unpack with the path-traversal-safe extractor, then drop the tarball
    log.info(f"Extracting to {model_dir}...")
    safe_extract_tarfile(tar_path, model_dir)
    os.unlink(tar_path)
57
+
58
+
59
def load_pytorch_model_artifacts(model_dir: str) -> Tuple[Any, dict]:
    """Load a PyTorch Tabular model plus its companion artifacts from disk.

    Args:
        model_dir: Directory containing extracted model artifacts

    Returns:
        Tuple of (TabularModel, artifacts_dict).
        artifacts_dict contains 'label_encoder' and 'category_mappings' if present.
    """
    import json

    import joblib

    # pytorch-tabular saves complex objects, use legacy loading behavior
    os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
    from pytorch_tabular import TabularModel

    tabular_dir = os.path.join(model_dir, "tabular_model")
    if not os.path.exists(tabular_dir):
        raise FileNotFoundError(f"No tabular_model directory found in {model_dir}")

    # callbacks.sav is not needed for inference, and joblib.load has no
    # map_location, so a GPU-trained callbacks file breaks CPU loading -- remove it
    callbacks_file = os.path.join(tabular_dir, "callbacks.sav")
    if os.path.exists(callbacks_file):
        os.remove(callbacks_file)

    # PyTorch Tabular needs a writable cwd, so hop into /tmp for the load
    saved_cwd = os.getcwd()
    try:
        os.chdir("/tmp")
        # map_location="cpu" ensures GPU-trained models work on CPU
        model = TabularModel.load_model(tabular_dir, map_location="cpu")
    finally:
        os.chdir(saved_cwd)

    # Optional side-car artifacts saved alongside the model
    artifacts = {}

    encoder_file = os.path.join(model_dir, "label_encoder.joblib")
    if os.path.exists(encoder_file):
        artifacts["label_encoder"] = joblib.load(encoder_file)

    mappings_file = os.path.join(model_dir, "category_mappings.json")
    if os.path.exists(mappings_file):
        with open(mappings_file) as fh:
            artifacts["category_mappings"] = json.load(fh)

    return model, artifacts
109
+
110
+
111
+ def _extract_model_configs(loaded_model: Any, n_train: int) -> dict:
112
+ """Extract trainer and model configs from a loaded PyTorch Tabular model.
113
+
114
+ Args:
115
+ loaded_model: Loaded TabularModel instance
116
+ n_train: Number of training samples (used for batch_size calculation)
117
+
118
+ Returns:
119
+ Dictionary with 'trainer' and 'model' config dictionaries
120
+ """
121
+ config = loaded_model.config
122
+
123
+ # Trainer config - extract from loaded model, matching template defaults
124
+ trainer_defaults = {
125
+ "auto_lr_find": False,
126
+ "batch_size": min(128, max(32, n_train // 16)),
127
+ "max_epochs": 100,
128
+ "min_epochs": 10,
129
+ "early_stopping": "valid_loss",
130
+ "early_stopping_patience": 10,
131
+ "gradient_clip_val": 1.0,
132
+ }
133
+
134
+ trainer_config = {}
135
+ for key, default in trainer_defaults.items():
136
+ value = getattr(config, key, default)
137
+ if value == default and not hasattr(config, key):
138
+ log.warning(f"Trainer config '{key}' not found in loaded model, using default: {default}")
139
+ trainer_config[key] = value
140
+
141
+ # Model config - extract from loaded model, matching template defaults
142
+ model_defaults = {
143
+ "layers": "256-128-64",
144
+ "activation": "LeakyReLU",
145
+ "learning_rate": 1e-3,
146
+ "dropout": 0.3,
147
+ "use_batch_norm": True,
148
+ "initialization": "kaiming",
149
+ }
150
+
151
+ model_config = {}
152
+ for key, default in model_defaults.items():
153
+ value = getattr(config, key, default)
154
+ if value == default and not hasattr(config, key):
155
+ log.warning(f"Model config '{key}' not found in loaded model, using default: {default}")
156
+ model_config[key] = value
157
+
158
+ return {"trainer": trainer_config, "model": model_config}
159
+
160
+
161
def cross_fold_inference(
    workbench_model: Any,
    nfolds: int = 5,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Performs K-fold cross-validation for PyTorch Tabular models.

    Replicates the training setup from the original model (architecture,
    trainer settings, label encoding, and categorical mappings) to ensure
    cross-validation results are comparable to the deployed model.

    Args:
        workbench_model: Workbench model object
        nfolds: Number of folds for cross-validation (default is 5)

    Returns:
        Tuple of:
        - DataFrame with per-class metrics (and 'all' row for overall metrics)
        - DataFrame with columns: id, target, prediction, and *_proba columns (for classifiers)
    """
    import shutil

    from pytorch_tabular import TabularModel
    from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
    from pytorch_tabular.models import CategoryEmbeddingModelConfig

    from workbench.api import FeatureSet

    # Create a temporary model directory
    model_dir = tempfile.mkdtemp(prefix="pytorch_cv_")
    log.info(f"Using model directory: {model_dir}")

    try:
        # Download and extract model artifacts to get config and artifacts
        model_artifact_uri = workbench_model.model_data_url()
        download_and_extract_model(model_artifact_uri, model_dir)

        # Load model and artifacts
        loaded_model, artifacts = load_pytorch_model_artifacts(model_dir)
        category_mappings = artifacts.get("category_mappings", {})

        # Determine if classifier from the loaded model's config
        is_classifier = loaded_model.config.task == "classification"

        # Use saved label encoder if available, otherwise create fresh one
        if is_classifier:
            label_encoder = artifacts.get("label_encoder")
            if label_encoder is None:
                log.warning("No saved label encoder found, creating fresh one")
                label_encoder = LabelEncoder()
        else:
            label_encoder = None

        # Prepare data
        fs = FeatureSet(workbench_model.get_input())
        df = workbench_model.training_view().pull_dataframe()

        # Get columns
        id_col = fs.id_column
        target_col = workbench_model.target()
        feature_cols = workbench_model.features()
        print(f"Target column: {target_col}")
        print(f"Feature columns: {len(feature_cols)} features")

        # Convert string columns to category for PyTorch Tabular compatibility,
        # reusing training-time category mappings when available so categories
        # line up with the deployed model
        for col in feature_cols:
            if pd.api.types.is_string_dtype(df[col]):
                if col in category_mappings:
                    df[col] = pd.Categorical(df[col], categories=category_mappings[col])
                else:
                    df[col] = df[col].astype("category")

        # Determine categorical and continuous columns
        categorical_cols = [col for col in feature_cols if df[col].dtype.name == "category"]
        continuous_cols = [col for col in feature_cols if col not in categorical_cols]

        # Cast continuous columns to float
        if continuous_cols:
            df[continuous_cols] = df[continuous_cols].astype("float64")

        # Drop rows with NaN features or target (PyTorch Tabular cannot handle NaN values)
        nan_mask = df[feature_cols].isna().any(axis=1) | df[target_col].isna()
        if nan_mask.any():
            n_nan_rows = nan_mask.sum()
            log.warning(
                f"Dropping {n_nan_rows} rows ({100*n_nan_rows/len(df):.1f}%) with NaN values for cross-validation"
            )
            df = df[~nan_mask].reset_index(drop=True)

        X = df[feature_cols]
        y = df[target_col]
        ids = df[id_col]

        # Encode target if classifier (fit only when the encoder is fresh)
        if label_encoder is not None:
            if not hasattr(label_encoder, "classes_"):
                label_encoder.fit(y)
            y_encoded = label_encoder.transform(y)
            y_for_cv = pd.Series(y_encoded, index=y.index, name=target_col)
        else:
            y_for_cv = y

        # Extract configs from loaded model (pass approx train size for batch_size calculation)
        n_train_approx = int(len(df) * (1 - 1 / nfolds))
        configs = _extract_model_configs(loaded_model, n_train_approx)
        trainer_params = configs["trainer"]
        model_params = configs["model"]

        log.info(f"Trainer config:\n{pformat(trainer_params)}")
        log.info(f"Model config:\n{pformat(model_params)}")

        # Prepare KFold (stratified for classifiers to keep class balance per fold)
        kfold = (StratifiedKFold if is_classifier else KFold)(n_splits=nfolds, shuffle=True, random_state=42)

        # Initialize results collection
        fold_metrics = []
        predictions_df = pd.DataFrame({id_col: ids, target_col: y})
        if is_classifier:
            predictions_df["pred_proba"] = [None] * len(predictions_df)

        # Perform cross-validation
        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X, y_for_cv), 1):
            print(f"\n{'='*50}")
            print(f"Fold {fold_idx}/{nfolds}")
            print(f"{'='*50}")

            # Split data
            df_train = df.iloc[train_idx].copy()
            df_val = df.iloc[val_idx].copy()

            # Encode target for this fold
            if is_classifier:
                df_train[target_col] = label_encoder.transform(df_train[target_col])
                df_val[target_col] = label_encoder.transform(df_val[target_col])

            # Create configs for this fold - matching the training template exactly
            data_config = DataConfig(
                target=[target_col],
                continuous_cols=continuous_cols,
                categorical_cols=categorical_cols,
            )

            trainer_config = TrainerConfig(
                auto_lr_find=trainer_params["auto_lr_find"],
                batch_size=trainer_params["batch_size"],
                max_epochs=trainer_params["max_epochs"],
                min_epochs=trainer_params["min_epochs"],
                early_stopping=trainer_params["early_stopping"],
                early_stopping_patience=trainer_params["early_stopping_patience"],
                gradient_clip_val=trainer_params["gradient_clip_val"],
                checkpoints="valid_loss",  # Save best model based on validation loss
                accelerator="cpu",
            )

            optimizer_config = OptimizerConfig()

            model_config = CategoryEmbeddingModelConfig(
                task="classification" if is_classifier else "regression",
                layers=model_params["layers"],
                activation=model_params["activation"],
                learning_rate=model_params["learning_rate"],
                dropout=model_params["dropout"],
                use_batch_norm=model_params["use_batch_norm"],
                initialization=model_params["initialization"],
            )

            # Create and train fresh model
            tabular_model = TabularModel(
                data_config=data_config,
                model_config=model_config,
                optimizer_config=optimizer_config,
                trainer_config=trainer_config,
            )

            # Change to /tmp for training (PyTorch Tabular needs write access)
            original_cwd = os.getcwd()
            try:
                os.chdir("/tmp")
                # Clean up checkpoint directory from previous fold
                checkpoint_dir = "/tmp/saved_models"
                if os.path.exists(checkpoint_dir):
                    shutil.rmtree(checkpoint_dir)
                tabular_model.fit(train=df_train, validation=df_val)
            finally:
                os.chdir(original_cwd)

            # Make predictions
            result = tabular_model.predict(df_val[feature_cols])

            # Extract predictions
            prediction_col = f"{target_col}_prediction"
            preds = result[prediction_col].values

            # Store predictions at the correct indices
            val_indices = df.iloc[val_idx].index
            # BUGFIX: reset probs every fold; previously it was only assigned when
            # probability columns existed, so a fold without them either raised a
            # NameError (fold 1) or silently reused the previous fold's values
            probs = None
            if is_classifier:
                preds_decoded = label_encoder.inverse_transform(preds.astype(int))
                predictions_df.loc[val_indices, "prediction"] = preds_decoded

                # Get probabilities and store at validation indices only
                prob_cols = sorted([col for col in result.columns if col.endswith("_probability")])
                if prob_cols:
                    probs = result[prob_cols].values
                    for i, idx in enumerate(val_indices):
                        predictions_df.at[idx, "pred_proba"] = probs[i].tolist()
            else:
                predictions_df.loc[val_indices, "prediction"] = preds

            # Calculate fold metrics
            if is_classifier:
                y_val_orig = label_encoder.inverse_transform(df_val[target_col])
                preds_orig = preds_decoded

                prec, rec, f1, _ = precision_recall_fscore_support(
                    y_val_orig, preds_orig, average="weighted", zero_division=0
                )

                prec_per_class, rec_per_class, f1_per_class, _ = precision_recall_fscore_support(
                    y_val_orig, preds_orig, average=None, zero_division=0, labels=label_encoder.classes_
                )

                n_classes = len(label_encoder.classes_)
                y_val_encoded = df_val[target_col].values
                if probs is None:
                    # No probability columns -> ROC-AUC is undefined for this fold
                    log.warning("No probability columns found; ROC-AUC set to NaN for this fold")
                    roc_auc_overall = np.nan
                    roc_auc_per_class = np.full(n_classes, np.nan)
                elif n_classes == 2:
                    # BUGFIX: for binary targets sklearn expects the positive-class
                    # scores, not a 2-column matrix with multi_class="ovr"
                    roc_auc_overall = roc_auc_score(y_val_encoded, probs[:, 1])
                    roc_auc_per_class = np.array([roc_auc_overall, roc_auc_overall])
                else:
                    roc_auc_overall = roc_auc_score(y_val_encoded, probs, multi_class="ovr", average="macro")
                    roc_auc_per_class = roc_auc_score(y_val_encoded, probs, multi_class="ovr", average=None)

                fold_metrics.append(
                    {
                        "fold": fold_idx,
                        "precision": prec,
                        "recall": rec,
                        "f1": f1,
                        "roc_auc": roc_auc_overall,
                        "precision_per_class": prec_per_class,
                        "recall_per_class": rec_per_class,
                        "f1_per_class": f1_per_class,
                        "roc_auc_per_class": roc_auc_per_class,
                    }
                )

                print(f"Fold {fold_idx} - F1: {f1:.4f}, ROC-AUC: {roc_auc_overall:.4f}")
            else:
                y_val = df_val[target_col].values
                spearman_corr, _ = spearmanr(y_val, preds)
                rmse = np.sqrt(mean_squared_error(y_val, preds))

                fold_metrics.append(
                    {
                        "fold": fold_idx,
                        "rmse": rmse,
                        "mae": mean_absolute_error(y_val, preds),
                        "medae": median_absolute_error(y_val, preds),
                        "r2": r2_score(y_val, preds),
                        "spearmanr": spearman_corr,
                    }
                )

                print(f"Fold {fold_idx} - RMSE: {rmse:.4f}, R2: {fold_metrics[-1]['r2']:.4f}")

        # Calculate summary metrics
        fold_df = pd.DataFrame(fold_metrics)

        if is_classifier:
            if "pred_proba" in predictions_df.columns:
                predictions_df = expand_proba_column(predictions_df, label_encoder.classes_)

            # Hoisted out of the per-class loop: this decode is invariant across classes
            y_orig = label_encoder.inverse_transform(y_for_cv)

            metric_rows = []
            for idx, class_name in enumerate(label_encoder.classes_):
                prec_scores = np.array([fold["precision_per_class"][idx] for fold in fold_metrics])
                rec_scores = np.array([fold["recall_per_class"][idx] for fold in fold_metrics])
                f1_scores = np.array([fold["f1_per_class"][idx] for fold in fold_metrics])
                roc_auc_scores = np.array([fold["roc_auc_per_class"][idx] for fold in fold_metrics])

                support = int((y_orig == class_name).sum())

                metric_rows.append(
                    {
                        "class": class_name,
                        "precision": prec_scores.mean(),
                        "recall": rec_scores.mean(),
                        "f1": f1_scores.mean(),
                        "roc_auc": roc_auc_scores.mean(),
                        "support": support,
                    }
                )

            metric_rows.append(
                {
                    "class": "all",
                    "precision": fold_df["precision"].mean(),
                    "recall": fold_df["recall"].mean(),
                    "f1": fold_df["f1"].mean(),
                    "roc_auc": fold_df["roc_auc"].mean(),
                    "support": len(y_for_cv),
                }
            )

            metrics_df = pd.DataFrame(metric_rows)
        else:
            metrics_df = pd.DataFrame(
                [
                    {
                        "rmse": fold_df["rmse"].mean(),
                        "mae": fold_df["mae"].mean(),
                        "medae": fold_df["medae"].mean(),
                        "r2": fold_df["r2"].mean(),
                        "spearmanr": fold_df["spearmanr"].mean(),
                        "support": len(y_for_cv),
                    }
                ]
            )

        print(f"\n{'='*50}")
        print("Cross-Validation Summary")
        print(f"{'='*50}")
        print(metrics_df.to_string(index=False))

        return metrics_df, predictions_df

    finally:
        log.info(f"Cleaning up model directory: {model_dir}")
        shutil.rmtree(model_dir, ignore_errors=True)
481
+
482
+
483
+ if __name__ == "__main__":
484
+
485
+ # Tests for the PyTorch utilities
486
+ from workbench.api import Model, Endpoint
487
+
488
+ # Initialize Workbench model
489
+ model_name = "caco2-er-reg-pytorch-test"
490
+ # model_name = "aqsol-pytorch-reg"
491
+ print(f"Loading Workbench model: {model_name}")
492
+ model = Model(model_name)
493
+ print(f"Model Framework: {model.model_framework}")
494
+
495
+ # Perform cross-fold inference
496
+ end = Endpoint(model.endpoints()[0])
497
+ end.cross_fold_inference()
@@ -308,16 +308,21 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[pd.Data
308
308
  fs = FeatureSet(workbench_model.get_input())
309
309
  df = workbench_model.training_view().pull_dataframe()
310
310
 
311
- # Get id column - assuming FeatureSet has an id_column attribute or similar
311
+ # Extract sample weights if present
312
+ sample_weights = df.get("sample_weight")
313
+ if sample_weights is not None:
314
+ log.info(f"Using sample weights: min={sample_weights.min():.2f}, max={sample_weights.max():.2f}")
315
+
316
+ # Get columns
312
317
  id_col = fs.id_column
313
318
  target_col = workbench_model.target()
314
319
  feature_cols = workbench_model.features()
320
+ print(f"Target column: {target_col}")
321
+ print(f"Feature columns: {len(feature_cols)} features")
315
322
 
316
323
  # Convert string[python] to object, then to category for XGBoost compatibility
317
- # This avoids XGBoost's issue with pandas 2.x string[python] dtype in categorical categories
318
324
  for col in feature_cols:
319
325
  if pd.api.types.is_string_dtype(df[col]):
320
- # Double conversion: string[python] -> object -> category
321
326
  df[col] = df[col].astype("object").astype("category")
322
327
 
323
328
  X = df[feature_cols]
@@ -333,7 +338,6 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[pd.Data
333
338
  y_for_cv = y
334
339
 
335
340
  # Prepare KFold
336
- # Note: random_state=42 seems to not actually give us reproducible results
337
341
  kfold = (StratifiedKFold if is_classifier else KFold)(n_splits=nfolds, shuffle=True, random_state=42)
338
342
 
339
343
  # Initialize results collection
@@ -345,8 +349,11 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Tuple[pd.Data
345
349
  X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
346
350
  y_train, y_val = y_for_cv.iloc[train_idx], y_for_cv.iloc[val_idx]
347
351
 
352
+ # Get sample weights for training fold
353
+ weights_train = sample_weights.iloc[train_idx] if sample_weights is not None else None
354
+
348
355
  # Train and predict
349
- xgb_model.fit(X_train, y_train)
356
+ xgb_model.fit(X_train, y_train, sample_weight=weights_train)
350
357
  preds = xgb_model.predict(X_val)
351
358
 
352
359
  # Store predictions (decode if classifier)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: workbench
3
- Version: 0.8.197
3
+ Version: 0.8.201
4
4
  Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
5
5
  Author-email: SuperCowPowers LLC <support@supercowpowers.com>
6
6
  License: MIT License
@@ -42,7 +42,7 @@ Requires-Dist: redis>=5.0.1
42
42
  Requires-Dist: numpy>=1.26.4
43
43
  Requires-Dist: pandas>=2.2.1
44
44
  Requires-Dist: awswrangler>=3.4.0
45
- Requires-Dist: sagemaker>=2.143
45
+ Requires-Dist: sagemaker<3.0,>=2.143
46
46
  Requires-Dist: cryptography>=44.0.2
47
47
  Requires-Dist: ipython>=8.37.0
48
48
  Requires-Dist: pyreadline3; sys_platform == "win32"