py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- py2ls/.git/index +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +105 -55
- py2ls/ml2ls.py +244 -110
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +9 -8
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py (CHANGED)
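All of the hunks below touch the `predict()` function. Taken together they: add two new keyword arguments (`fill_missing`, `scaler`); forward `n_jobs` to every estimator that accepts it; wrap each model's fit/predict cycle in `try`/`except` so a failing estimator is skipped rather than fatal; and record three new keys (`features`, `coef`, `alphas`) in every per-model results dict.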
```diff
@@ -2206,6 +2206,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing: bool = True,
+    scaler: str = 'standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr: float = 0.05,  # p-value threshold; only used when backward is True
     common_features: set = None,
```
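The new `fill_missing` flag turns on KNN imputation before training, and `scaler` picks one of four scaling strategies. As a rough sketch of what those four keywords conventionally map to in sklearn (the real mapping lives inside py2ls's `ips.df_scaler`, whose body is not part of this diff):

```python
# Illustrative sketch only: a plausible keyword-to-scaler mapping.
# The keyword names mirror the comment in the diff above.
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    RobustScaler,
    StandardScaler,
)

SCALERS = {
    "standard": StandardScaler,  # zero mean, unit variance
    "minmax": MinMaxScaler,      # rescale features to [0, 1]
    "robust": RobustScaler,      # median/IQR, tolerant of outliers
    "maxabs": MaxAbsScaler,      # divide by the max absolute value
}

def make_scaler(method: str = "standard"):
    """Return an unfitted scaler instance for the given keyword."""
    return SCALERS[method]()
```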
```diff
@@ -2324,7 +2326,7 @@ def predict(
         if purpose == "classification":
             model_ = {
                 "Random Forest": RandomForestClassifier(
-                    random_state=random_state, class_weight=class_weight
+                    random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
                 ),
                 # SVC (Support Vector Classification)
                 "SVM": SVC(
```
```diff
@@ -2335,7 +2337,7 @@ def predict(
                 ),
                 # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
                 "Logistic Regression": LogisticRegression(
-                    class_weight=class_weight, random_state=random_state
+                    class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
                 ),
                 # Logistic Regression with L1 Regularization (Lasso)
                 "Lasso Logistic Regression": LogisticRegression(
```
```diff
@@ -2346,49 +2348,51 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
             # "Lasso": Lasso(random_state=random_state),  # same as LassoCV (but alpha must be provided),
             "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
+                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
             ),  # LassoCV finds the best alpha automatically and is preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
+                                          force_row_wise=True  # or use force_col_wise=True if memory is a concern
+                                          ),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen": TheilSenRegressor(),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
             "Poisson": PoissonRegressor()
         }
```
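Nearly every change in the three hunks above threads `n_jobs` through the estimators that support it, so one parameter now controls CPU parallelism across the whole model zoo. A minimal standalone illustration of the pattern (hypothetical stand-in dict, not py2ls code):

```python
# n_jobs=-1 uses all available cores; estimators lacking the
# parameter (e.g., GaussianNB) are simply constructed without it.
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

n_jobs = -1
models = {
    "Random Forest": RandomForestClassifier(random_state=1, n_jobs=n_jobs),
    "Naive Bayes": GaussianNB(),  # no n_jobs parameter
}
```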
```diff
@@ -2410,7 +2414,7 @@ def predict(
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
```
```diff
@@ -2418,6 +2422,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
```
```diff
@@ -2430,6 +2435,9 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)
 
+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
         selected_features = backward_regression(x_train, y_train, thr=backward_thr)
```
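`ips.df_fillna(..., method="knn")` imputes each missing value from similar rows. A self-contained sklearn equivalent of KNN imputation (py2ls wraps comparable logic behind a DataFrame-friendly call; this is a sketch, not its implementation):

```python
# KNN imputation sketch: each NaN is replaced by averaging that
# column over the k most similar rows.
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": [4.0, 5.0, None]})
imputer = KNNImputer(n_neighbors=2)
filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print(filled)
```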
```diff
@@ -2458,6 +2466,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()
 
+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
```
```diff
@@ -2490,11 +2500,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
 
-
+    #! scaler
+    # fit the scaler on x_train, then export it so x_true is transformed with the same fit
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
     )
```
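The important detail here is the asymmetry: `df_scaler` is fitted on `x_train` and the returned `scaler_` object is reused to transform `x_true`, so no test-set statistics leak into training. The underlying sklearn idiom, as a sketch with toy data:

```python
# Fit scaling statistics on the training split only, then apply
# the already-fitted scaler to the evaluation split.
import numpy as np
from sklearn.preprocessing import StandardScaler

x_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
x_true = np.array([[1.5, 15.0]])

scaler_ = StandardScaler().fit(x_train)    # statistics from x_train alone
x_train_scaled = scaler_.transform(x_train)
x_true_scaled = scaler_.transform(x_true)  # same scaler, no leakage
```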
```diff
@@ -2516,6 +2531,7 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+
     # Hyperparameter grids for tuning
     param_grid_common_xgb = {
         'learning_rate': [0.01, 0.1, 0.2, 0.3],
```
```diff
@@ -3168,83 +3184,124 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-        # Grid search with KFold or StratifiedKFold
-        if is_binary:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-            # make sure x_train and x_test have the same column names
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba.shape)
-                if y_pred_proba.shape[1] == 1:
-                    y_pred_proba = np.hstack(
-                        [1 - y_pred_proba, y_pred_proba]
-                    )  # Add missing class probabilities
-                y_pred_proba = y_pred_proba[:, 1]
-            elif hasattr(best_clf, "decision_function"):
-                # If predict_proba is not available, use decision_function (e.g., for SVM)
-                y_pred_proba = best_clf.decision_function(x_true)
-                # Ensure y_pred_proba is within 0 and 1 bounds
-                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                    y_pred_proba.max() - y_pred_proba.min()
+        try:
+            # Grid search with KFold or StratifiedKFold
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-            else:
-                y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            # Fit GridSearchCV
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-
-            # Ensure x_true aligns with x_train columns
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-
-            # Handle prediction probabilities for multiclass
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-            elif hasattr(best_clf, "decision_function"):
-                y_pred_proba = best_clf.decision_function(x_true)
 
-            # Normalize for multiclass if necessary
-            if y_pred_proba.ndim == 2:
-                y_pred_proba = (
-                    y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
-                ) / (
-                    y_pred_proba.max(axis=1, keepdims=True)
-                    - y_pred_proba.min(axis=1, keepdims=True)
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+                # make sure x_train and x_test have the same column names
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    y_pred_proba = y_pred_proba[:, 1]
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                     )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
             else:
-                y_pred_proba = None  # No probability output for certain models
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
 
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do i need to fit the x_train, y_train again?
+                best_clf = best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba.ndim == 2:
+                    y_pred_proba = (
+                        y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba.max(axis=1, keepdims=True)
+                        - y_pred_proba.min(axis=1, keepdims=True)
+                    )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+        # try to make the predict format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
```
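Two things happen in this large hunk: each model's entire fit/predict sequence is wrapped in `try`/`except`, so a failing estimator is reported and skipped via `continue` instead of crashing the loop, and regularization information is harvested from fitted models by attribute probing. The probing works because sklearn's CV estimators expose `alphas_`/`alpha_` (e.g. `LassoCV`) or `Cs_` (`LogisticRegressionCV`) alongside `coef_`. A self-contained example of that probing:

```python
# Attribute probing as in the diff: collect the candidate alphas and
# fitted coefficients when an estimator exposes them, else None.
import numpy as np
from sklearn.linear_model import LassoCV

rng = np.random.RandomState(0)
X = rng.rand(50, 4)
y = X @ np.array([1.0, 0.0, -2.0, 0.5]) + 0.1 * rng.rand(50)

best_clf = LassoCV(cv=3).fit(X, y)
alphas_ = getattr(best_clf, "alphas_", None)  # regularization path
coef_ = getattr(best_clf, "coef_", None)      # fitted coefficients
print(alphas_.shape, coef_)
```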
```diff
@@ -3294,20 +3351,26 @@ def predict(
                 "roc_curve": roc_info,
                 "pr_curve": pr_info,
                 "confusion_matrix": confusion_matrix(y_true, y_pred),
-                "predictions": y_pred
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
         else:  # "regression"
             results[name] = {
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                "predictions": y_pred
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
     else:  # multi-classes
         if y_pred_proba is not None:
```
```diff
@@ -3346,20 +3409,26 @@ def predict(
                 "roc_curve": roc_info,
                 "pr_curve": pr_info,
                 "confusion_matrix": confusion_matrix(y_true, y_pred),
-                "predictions": y_pred
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
         else:  # "regression"
             results[name] = {
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                "predictions": y_pred
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
 
     else:
```
```diff
@@ -3378,12 +3447,15 @@ def predict(
             "best_clf": gs.best_estimator_,
             "best_params": gs.best_params_,
             "scores": validation_scores,
-            "predictions": y_pred
+            "predictions": y_pred,  # .tolist(),
             "predictions_proba": (
                 y_pred_proba.tolist() if y_pred_proba is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
+            "coef": coef_,
+            "alphas": alphas_
         }
 
     # Convert results to DataFrame
```
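With the three new keys, every per-model entry is self-describing: `features` records the columns the model was actually trained on, while `coef` and `alphas` carry the fitted coefficients and the regularization path when the estimator exposes them. Hypothetical downstream use (the key names match the diff; the values here are made up):

```python
# Reading the enriched results entries produced by this release.
results = {
    "LassoCV": {
        "features": ["age", "bmi"],
        "coef": [0.8, -0.2],
        "alphas": [0.01, 0.1, 1.0],
    }
}
for name, res in results.items():
    print(name, dict(zip(res["features"], res["coef"])))
```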
```diff
@@ -3446,7 +3518,7 @@ def predict(
         for i, j in top_models.to_dict().items():
             base_estimators.append((i, j))
         if stacking_cv:
-            print(f"
+            print(f"⤵ stacking_cv is processing...")
            #* define a few symbolic final_estimator candidates
            # several alternatives
            if purpose == "classification":
```
```diff
@@ -3520,7 +3592,7 @@ def predict(
         best_final_estimator = cv_results_df.iloc[0]['final_estimator']
         print(f"Best final estimator based on cross-validation: {best_final_estimator}")
     else:
-        print(f"
+        print(f"⤵ trying to find the best_final_estimator for stacking...")
         if purpose == "classification":
             best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                       random_state=random_state,
```
```diff
@@ -3530,26 +3602,25 @@ def predict(
     print(f"⤵ the best best_final_estimator: {best_final_estimator}")
     #! apply stacking
     if purpose == "classification":
-        print(f"
+        print(f"⤵ StackingClassifier...")
         stacking_model = StackingClassifier(estimators=base_estimators,
                                             final_estimator=best_final_estimator,
                                             cv=cv)
     else:
-        print(f"
+        print(f"⤵ StackingRegressor...")
         stacking_model = StackingRegressor(estimators=base_estimators,
                                            final_estimator=best_final_estimator,
                                            cv=cv)
 
     # Train the Stacking Classifier
-    print(f"
+    print(f"⤵ fit & predict...")
     stacking_model.fit(x_train, y_train)
     y_pred_final = stacking_model.predict(x_true)
-    print(f"
+    print(f"⤵ collecting results...")
     # pred_proba
     if is_binary:
         if hasattr(stacking_model, "predict_proba"):
             y_pred_proba_final = stacking_model.predict_proba(x_true)
-            print("Shape of predicted probabilities:", y_pred_proba_final.shape)
             if y_pred_proba_final.shape[1] == 1:
                 y_pred_proba_final = np.hstack(
                     [1 - y_pred_proba_final, y_pred_proba_final]
```
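For context on what the stacking branch builds: the top-ranked models become `base_estimators`, and a `final_estimator` (meta-learner) is trained on their cross-validated predictions. A minimal runnable sketch of the same flow with stand-in models:

```python
# Stacking sketch: base estimators feed a logistic-regression
# meta-learner trained on their out-of-fold predictions.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)
base_estimators = [
    ("rf", RandomForestClassifier(random_state=0)),
    ("dt", DecisionTreeClassifier(random_state=0)),
]
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(max_iter=500),
    cv=5,
)
stacking_model.fit(X, y)
print(stacking_model.predict_proba(X[:3]))
```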
```diff
@@ -3564,6 +3635,17 @@ def predict(
                 )
         else:
             y_pred_proba_final = None  # No probability output for certain models
+        # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+        if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+            if hasattr(best_clf, "alphas_"):
+                alphas_ = best_clf.alphas_
+            elif hasattr(best_clf, "alpha_"):
+                alphas_ = best_clf.alpha_
+            elif hasattr(best_clf, "Cs_"):
+                alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+        coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
     if not is_binary:
         # Handle prediction probabilities for multiclass
         if hasattr(stacking_model, "predict_proba"):
```
```diff
@@ -3581,6 +3663,17 @@ def predict(
                 )
         else:
             y_pred_proba_final = None  # No probability output for certain models
+        # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+        if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+            if hasattr(best_clf, "alphas_"):
+                alphas_ = best_clf.alphas_
+            elif hasattr(best_clf, "alpha_"):
+                alphas_ = best_clf.alpha_
+            elif hasattr(best_clf, "Cs_"):
+                alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+        coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
     #! dict_pred_stack
     dict_pred_stack = {}
     validation_scores_final = {}
```
```diff
@@ -3631,6 +3724,9 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
             ),
+            "features": share_col_names,
+            "coef": coef_,
+            "alphas": alphas_
         }
     else:  # "regression"
         dict_pred_stack = {
```
```diff
@@ -3641,6 +3737,9 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
             ),
+            "features": share_col_names,
+            "coef": coef_,
+            "alphas": alphas_
         }
 else:  # multi-classes
     if y_pred_proba_final is not None:
```
```diff
@@ -3680,6 +3779,9 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
             ),
+            "features": share_col_names,
+            "coef": coef_,
+            "alphas": alphas_
         }
     else:  # "regression"
         dict_pred_stack = {
```
```diff
@@ -3690,6 +3792,9 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
             ),
+            "features": share_col_names,
+            "coef": coef_,
+            "alphas": alphas_
         }
 
     else:
```
```diff
@@ -3712,8 +3817,11 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
+            "coef": coef_,
+            "alphas": alphas_
         }
     # merge together
     df_pred = pd.DataFrame(
```
```diff
@@ -3728,16 +3836,16 @@ def predict(
     # if dir_save:
     #     ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
     if vote:
-        print(f"
+        print(f"⤵ voting...")
         from sklearn.ensemble import VotingClassifier, VotingRegressor
-        #!
+        #! voting
         n_top_models = min(n_top_models, df_results.shape[0])
         base_estimators = []
         for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
             base_estimators.append((name, cls))
         # Apply Voting Classifier/Regressor
         if purpose == "classification":
-            print(f"
+            print(f"⤵ VotingClassifier...via {voting}")
             if voting == 'hard':
                 # Hard voting does not support `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators)
```
```diff
@@ -3745,7 +3853,7 @@ def predict(
             # Soft voting supports `predict_proba`
             voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
         else:
-            print(f"
+            print(f"⤵ VotingRegressor...")
             voting_model = VotingRegressor(estimators=base_estimators)
 
         # Train the Voting Classifier/Regressor
```
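The hard/soft distinction in the voting branch matters because hard voting tallies predicted labels and therefore exposes no `predict_proba`, whereas soft voting averages class probabilities. A standalone sketch:

```python
# Hard voting counts labels; soft voting averages probabilities and
# is the only mode that supports predict_proba.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
base_estimators = [
    ("lr", LogisticRegression(max_iter=500)),
    ("rf", RandomForestClassifier(random_state=0)),
]
hard = VotingClassifier(estimators=base_estimators).fit(X, y)  # voting="hard"
soft = VotingClassifier(estimators=base_estimators, voting="soft").fit(X, y)
print(soft.predict_proba(X[:2]))  # hard.predict_proba(...) would raise
```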
```diff
@@ -3770,10 +3878,23 @@ def predict(
                 y_pred_proba_vote = y_pred_proba_vote[:, 1]
             else:
                 y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         else:  # Regression
             y_pred_proba_vote = None
+            coef_, alphas_ = None, None
 
-        print(f"
+        print(f"⤵ collecting voting results...")
         #! dict_pred_vote
         dict_pred_vote = {}
         validation_scores_vote = {}
```
```diff
@@ -3822,6 +3943,9 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
             ),
+            "features": share_col_names,
+            "coef": coef_,
+            "alphas": alphas_
         }
     else:  # Multi-class
         if y_pred_proba_vote is not None:
```
```diff
@@ -3856,6 +3980,9 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
             ),
+            "features": share_col_names,
+            "coef": coef_,
+            "alphas": alphas_
         }
 else:
     if y_true is None:
```
```diff
@@ -3877,6 +4004,7 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
         }
```
```diff
@@ -3907,9 +4035,15 @@ def predict(
     now_ = datetime.now().strftime("%y%m%d_%H%M%S")
     # try:
     if df_res.shape[0] > 3:
-        plot_validate_features(df_res, is_binary=is_binary)
+        try:
+            plot_validate_features(df_res, is_binary=is_binary)
+        except Exception as e:
+            print(e)
     else:
-        plot_validate_features_single(df_res, is_binary=is_binary)
+        try:
+            plot_validate_features_single(df_res, is_binary=is_binary)
+        except Exception as e:
+            print(e)
     if dir_save:
         ips.figsave(dir_save + f"validate_features{now_}.pdf")
     # except Exception as e:
```