py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.25__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -2206,6 +2206,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing: bool = True,
+    scaler: str = 'standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr: float = 0.05,  # pval thr, only works when backward is True
     common_features: set = None,
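
The new `scaler` argument accepts the four method names listed in the comment. Assuming `ips.df_scaler` maps them onto scikit-learn's preprocessing classes (an assumption; the implementation is not shown in this diff), the correspondence would be roughly:

    # hypothetical mapping; the keys mirror the option list in the signature above
    from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

    SCALERS = {
        "standard": StandardScaler,  # zero mean, unit variance
        "minmax": MinMaxScaler,      # rescale each feature to [0, 1]
        "robust": RobustScaler,      # center on median, scale by IQR; outlier-resistant
        "maxabs": MaxAbsScaler,      # divide by max absolute value; preserves sparsity
    }
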
@@ -2324,7 +2326,7 @@ def predict(
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2337,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,49 +2348,51 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-            # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
             # "Lasso": Lasso(random_state=random_state),  # same as LassoCV, except that alpha must be supplied explicitly
             "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
+                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
             ),  # LassoCV finds the optimal alpha automatically, so it is preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
+                force_row_wise=True  # or use force_col_wise=True if memory is a concern
+            ),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen": TheilSenRegressor(),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
             "Poisson": PoissonRegressor()
         }
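
This release threads `n_jobs` through every estimator that accepts it, so model fitting can use multiple cores via joblib. A minimal illustration (not from the package) of the effect:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=5000, n_features=50, random_state=0)
    # n_jobs=-1 trains the 500 trees on all available cores
    RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0).fit(X, y)
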
@@ -2410,7 +2414,7 @@ def predict(
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2418,6 +2422,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2430,6 +2435,9 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)
 
+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
         selected_features = backward_regression(x_train, y_train, thr=backward_thr)
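
`ips.df_fillna(method="knn")` presumably imputes each missing value from similar rows; its implementation is not shown in this diff, but a sketch of the same idea with scikit-learn's `KNNImputer` would be:

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer

    df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
    imputer = KNNImputer(n_neighbors=2)  # fill each NaN from the 2 nearest complete rows
    df_filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
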
@@ -2458,6 +2466,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()
 
+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
@@ -2490,11 +2500,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
 
-    x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
+    #! scaler
+    # fit the scaler on x_train, then export it so x_true can be transformed with it
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
    )
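
The change above fixes a subtle leakage problem: previously `x_train` and `x_true` were scaled independently, each with its own statistics. The standard pattern, which `return_scaler=True` appears to enable, is fit-on-train, transform-both. A self-contained sketch:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    x_train = pd.DataFrame({"f": [1.0, 2.0, 3.0]})
    x_true = pd.DataFrame({"f": [2.0, 10.0]})
    scaler_ = StandardScaler().fit(x_train)    # statistics come from training data only
    x_true_scaled = scaler_.transform(x_true)  # test data reuses them; no refit, no leakage
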
@@ -2516,6 +2531,7 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+
     # Hyperparameter grids for tuning
     param_grid_common_xgb = {
         'learning_rate': [0.01, 0.1, 0.2, 0.3],
@@ -3168,83 +3184,124 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-        # Grid search with KFold or StratifiedKFold
-        if is_binary:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-            # make sure x_train and x_test has the same name
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba.shape)
-                if y_pred_proba.shape[1] == 1:
-                    y_pred_proba = np.hstack(
-                        [1 - y_pred_proba, y_pred_proba]
-                    )  # Add missing class probabilities
-                y_pred_proba = y_pred_proba[:, 1]
-            elif hasattr(best_clf, "decision_function"):
-                # If predict_proba is not available, use decision_function (e.g., for SVM)
-                y_pred_proba = best_clf.decision_function(x_true)
-                # Ensure y_pred_proba is within 0 and 1 bounds
-                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                    y_pred_proba.max() - y_pred_proba.min()
+        try:
+            # Grid search with KFold or StratifiedKFold
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-        else:
-            y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            # Fit GridSearchCV
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-
-            # Ensure x_true aligns with x_train columns
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-
-            # Handle prediction probabilities for multiclass
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-            elif hasattr(best_clf, "decision_function"):
-                y_pred_proba = best_clf.decision_function(x_true)
 
-                # Normalize for multiclass if necessary
-                if y_pred_proba.ndim == 2:
-                    y_pred_proba = (
-                        y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
-                    ) / (
-                        y_pred_proba.max(axis=1, keepdims=True)
-                        - y_pred_proba.min(axis=1, keepdims=True)
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+                # make sure x_train and x_true have the same columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    y_pred_proba = y_pred_proba[:, 1]
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                     )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
             else:
-                y_pred_proba = None  # No probability output for certain models
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
 
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do we need to fit on x_train, y_train again?
+                best_clf = best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                    # Normalize for multiclass if necessary
+                    if y_pred_proba.ndim == 2:
+                        y_pred_proba = (
+                            y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                        ) / (
+                            y_pred_proba.max(axis=1, keepdims=True)
+                            - y_pred_proba.min(axis=1, keepdims=True)
+                        )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+        # try to make the prediction format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
 
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
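
The `alphas_`/`alpha_`/`Cs_`/`coef_` harvesting block above recurs in both branches here and again in the stacking and voting sections below. A small helper, offered only as a slightly simplified sketch of that shared logic (not part of the package), would be:

    def get_regularization_info(estimator):
        """Return (alphas, coef) when a fitted estimator exposes them, else (None, None)."""
        alphas = None
        # LassoCV/ElasticNetCV expose alphas_/alpha_; LogisticRegressionCV exposes Cs_
        for attr in ("alphas_", "alpha_", "Cs_"):
            if hasattr(estimator, attr):
                alphas = getattr(estimator, attr)
                break
        return alphas, getattr(estimator, "coef_", None)
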
@@ -3294,20 +3351,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
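
When only `decision_function` is available, the scores are min–max rescaled into [0, 1] as in the loop above; note these are bounded scores, not calibrated probabilities. The transform in isolation:

    import numpy as np

    def scores_to_unit_interval(scores: np.ndarray) -> np.ndarray:
        # min-max rescaling; monotone, so rankings (and hence ROC/AUC) are unchanged
        return (scores - scores.min()) / (scores.max() - scores.min())

    print(scores_to_unit_interval(np.array([-2.0, 0.0, 3.0])))  # [0.  0.4 1. ]
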
@@ -3346,20 +3409,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
 
         else:
@@ -3378,12 +3447,15 @@ def predict(
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,
-                "predictions": y_pred.tolist(),
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
 
     # Convert results to DataFrame
@@ -3446,7 +3518,7 @@ def predict(
         for i, j in top_models.to_dict().items():
             base_estimators.append((i, j))
         if stacking_cv:
-            print(f" ⤵ stacking_cv is processing...")
+            print(f"⤵ stacking_cv is processing...")
             #* define a few representative final_estimator candidates
             # several alternatives to choose from
             if purpose == "classification":
@@ -3520,7 +3592,7 @@ def predict(
             best_final_estimator = cv_results_df.iloc[0]['final_estimator']
             print(f"Best final estimator based on cross-validation: {best_final_estimator}")
         else:
-            print(f" ⤵ trying to find the best_final_estimator for stacking...")
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
             if purpose == "classification":
                 best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                           random_state=random_state,
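
The `stacking_cv` path above scores several candidate final estimators by cross-validation and keeps the best one. A compact sketch of that selection loop (the names `cv_results_df` and `best_final_estimator` follow the diff; the candidates and data here are hypothetical):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression, RidgeClassifier
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(n_samples=300, random_state=0)
    candidates = [LogisticRegression(max_iter=500), RidgeClassifier()]
    cv_results_df = pd.DataFrame(
        [{"final_estimator": est, "mean_score": cross_val_score(est, X, y, cv=5).mean()}
         for est in candidates]
    ).sort_values("mean_score", ascending=False)
    best_final_estimator = cv_results_df.iloc[0]["final_estimator"]
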
@@ -3530,26 +3602,25 @@ def predict(
         print(f"⤵ the best best_final_estimator: {best_final_estimator}")
         #! apply stacking
         if purpose == "classification":
-            print(f" ⤵ StackingClassifier...")
+            print(f"⤵ StackingClassifier...")
             stacking_model = StackingClassifier(estimators=base_estimators,
                                                 final_estimator=best_final_estimator,
                                                 cv=cv)
         else:
-            print(f" ⤵ StackingRegressor...")
+            print(f"⤵ StackingRegressor...")
             stacking_model = StackingRegressor(estimators=base_estimators,
                                                final_estimator=best_final_estimator,
                                                cv=cv)
 
         # Train the Stacking Classifier
-        print(f" ⤵ fit & predict...")
+        print(f"⤵ fit & predict...")
         stacking_model.fit(x_train, y_train)
         y_pred_final = stacking_model.predict(x_true)
-        print(f" ⤵ collecting results...")
+        print(f"⤵ collecting results...")
         # pred_proba
         if is_binary:
             if hasattr(stacking_model, "predict_proba"):
                 y_pred_proba_final = stacking_model.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
                 if y_pred_proba_final.shape[1] == 1:
                     y_pred_proba_final = np.hstack(
                         [1 - y_pred_proba_final, y_pred_proba_final]
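
For reference, a minimal, self-contained `StackingClassifier` in the same shape as the code above (synthetic data; in the package, `base_estimators` come from the top-ranked fitted models):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=300, random_state=0)
    stack = StackingClassifier(
        estimators=[
            ("rf", RandomForestClassifier(random_state=0)),
            ("svm", SVC(probability=True, random_state=0)),
        ],
        final_estimator=LogisticRegression(),  # meta-learner fit on out-of-fold predictions
        cv=5,
    )
    stack.fit(X, y)
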
@@ -3564,6 +3635,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         if not is_binary:
             # Handle prediction probabilities for multiclass
             if hasattr(stacking_model, "predict_proba"):
@@ -3581,6 +3663,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         #! dict_pred_stack
         dict_pred_stack = {}
         validation_scores_final = {}
@@ -3631,6 +3724,9 @@ def predict(
                     "predictions_proba": (
                         y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 dict_pred_stack = {
@@ -3641,6 +3737,9 @@ def predict(
                     "predictions_proba": (
                         y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba_final is not None:
@@ -3680,6 +3779,9 @@ def predict(
                     "predictions_proba": (
                         y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 dict_pred_stack = {
@@ -3690,6 +3792,9 @@ def predict(
                     "predictions_proba": (
                         y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
 
     else:
@@ -3712,8 +3817,11 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
         # merge together
         df_pred = pd.DataFrame(
@@ -3728,16 +3836,16 @@ def predict(
         # if dir_save:
         #     ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
     if vote:
-        print(f" ⤵ voting...")
+        print(f"⤵ voting...")
         from sklearn.ensemble import VotingClassifier, VotingRegressor
-        #! Votting
+        #! voting
         n_top_models = min(n_top_models, df_results.shape[0])
         base_estimators = []
         for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
             base_estimators.append((name, cls))
         # Apply Voting Classifier/Regressor
         if purpose == "classification":
-            print(f" ⤵ VotingClassifier...via{votting}")
+            print(f"⤵ VotingClassifier...via{voting}")
             if voting == 'hard':
                 # Hard voting does not support `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators)
@@ -3745,7 +3853,7 @@ def predict(
                 # Soft voting supports `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
         else:
-            print(f" ⤵ VotingRegressor...")
+            print(f"⤵ VotingRegressor...")
             voting_model = VotingRegressor(estimators=base_estimators)
 
         # Train the Voting Classifier/Regressor
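
Hard voting returns only majority-vote labels, which is why the branch above builds the hard `VotingClassifier` without `predict_proba` support, while soft voting averages class probabilities. A self-contained contrast:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=300, random_state=0)
    estimators = [("rf", RandomForestClassifier(random_state=0)),
                  ("lr", LogisticRegression(max_iter=500))]
    hard = VotingClassifier(estimators=estimators).fit(X, y)  # majority vote; no predict_proba
    soft = VotingClassifier(estimators=estimators, voting="soft").fit(X, y)
    proba = soft.predict_proba(X[:5])  # averaged class probabilities
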
@@ -3770,10 +3878,23 @@ def predict(
                 y_pred_proba_vote = y_pred_proba_vote[:, 1]
             else:
                 y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         else:  # Regression
             y_pred_proba_vote = None
+            coef_, alphas_ = None, None
 
-        print(f" ⤵ collecting voting results...")
+        print(f"⤵ collecting voting results...")
         #! dict_pred_vote
         dict_pred_vote = {}
         validation_scores_vote = {}
@@ -3822,6 +3943,9 @@ def predict(
                     "predictions_proba": (
                         y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # Multi-class
                 if y_pred_proba_vote is not None:
@@ -3856,6 +3980,9 @@ def predict(
                     "predictions_proba": (
                         y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:
             if y_true is None:
@@ -3877,6 +4004,7 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
             }
@@ -3907,9 +4035,15 @@ def predict(
     now_ = datetime.now().strftime("%y%m%d_%H%M%S")
     # try:
     if df_res.shape[0] > 3:
-        plot_validate_features(df_res, is_binary=is_binary)
+        try:
+            plot_validate_features(df_res, is_binary=is_binary)
+        except Exception as e:
+            print(e)
     else:
-        plot_validate_features_single(df_res, is_binary=is_binary)
+        try:
+            plot_validate_features_single(df_res, is_binary=is_binary)
+        except Exception as e:
+            print(e)
     if dir_save:
         ips.figsave(dir_save + f"validate_features{now_}.pdf")
     # except Exception as e: