py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +105 -55
- py2ls/ml2ls.py +244 -110
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +9 -8
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -2206,6 +2206,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing: bool = True,
+    scaler: str = 'standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr: float = 0.05,  # pval thr, only works when backward is True
     common_features: set = None,
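The user-facing change in this hunk is the two new keyword arguments: `fill_missing` switches on KNN imputation of missing values before fitting, and `scaler` selects which scaler is fit on the training split and reused on `x_true` (see the hunks around new lines 2438 and 2508 below). A minimal usage sketch; the call pattern and toy data are assumptions for illustration, not py2ls documentation:

```python
import numpy as np
import pandas as pd
from py2ls import ml2ls

# toy data: 100 samples, 5 features, binary target (illustration only)
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series((X["f0"] + rng.normal(size=100) > 0).astype(int), name="target")

res = ml2ls.predict(
    X.iloc[:80], y.iloc[:80],              # training split (assumed positional signature)
    x_true=X.iloc[80:], y_true=y.iloc[80:],
    fill_missing=True,                     # new in 0.2.4.25: KNN-impute NaNs before fitting
    scaler="standard",                     # new in 0.2.4.25: "standard" | "minmax" | "robust" | "maxabs"
)
```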
@@ -2324,7 +2326,7 @@ def predict(
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2337,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,49 +2348,51 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
             # "Lasso": Lasso(random_state=random_state),  # same as LassoCV, but the alpha parameter must be given explicitly
             "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
+                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
             ),  # LassoCV finds the best alpha automatically, so it is preferred over plain Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
+                                          force_row_wise=True  # Or use force_col_wise=True if memory is a concern
+                                          ),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen": TheilSenRegressor(),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
             "Poisson": PoissonRegressor()
         }
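Almost every change in the model registry is the same one-liner: estimators that accept scikit-learn's `n_jobs` now receive the function's `n_jobs` argument so fitting can parallelize across cores, while estimators without that parameter (e.g. `GaussianNB`, `GradientBoostingRegressor`) are left untouched, and LightGBM entries are added with `n_jobs` as well. The pattern, reduced to a standalone sketch:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

n_jobs = -1  # -1 lets scikit-learn use all available CPU cores
models = {
    "Random Forest": RandomForestClassifier(random_state=1, n_jobs=n_jobs),
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
    "Naive Bayes": GaussianNB(),  # no n_jobs parameter, so it stays as-is
}
for name, clf in models.items():
    print(name, clf.get_params().get("n_jobs", "n/a"))
```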
@@ -2410,7 +2414,7 @@ def predict(
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2418,6 +2422,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2430,6 +2435,9 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)

+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
         selected_features = backward_regression(x_train, y_train, thr=backward_thr)
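With `fill_missing=True`, the new block imputes NaNs in `x_train` (and `y_train`) via `ips.df_fillna(method="knn", ...)` before any feature selection. `df_fillna` is a py2ls helper whose exact options are not shown in this diff; the sketch below uses scikit-learn's `KNNImputer` only to illustrate what a KNN fill of a DataFrame does:

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": [2.0, 2.5, np.nan, 4.0]})
# each NaN is filled from the 2 most similar rows, judged on the non-missing columns
filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)
print(filled)
```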
@@ -2458,6 +2466,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()

+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
@@ -2490,11 +2500,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]

-
+    #! scaler
+    # fit the scaler on x_train and export it so the same scaler can be applied to x_true
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is reused
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
     )
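The scaling block fits the chosen scaler on `x_train` only and then applies that already-fitted scaler to `x_true`, which avoids leaking test-set statistics into the transform. `ips.df_scaler(..., return_scaler=True)` is py2ls's own wrapper; the standard scikit-learn idiom it mirrors looks like this (a self-contained sketch, not py2ls's implementation):

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

x_train = pd.DataFrame({"f0": [1.0, 2.0, 3.0], "f1": [10.0, 20.0, 30.0]})
x_true = pd.DataFrame({"f0": [1.5, 2.5], "f1": [15.0, 25.0]})

scaler_ = StandardScaler().fit(x_train)  # fit on the training split only
x_train_scaled = pd.DataFrame(scaler_.transform(x_train), columns=x_train.columns)
x_true_scaled = pd.DataFrame(scaler_.transform(x_true), columns=x_true.columns)  # reuse the same fitted scaler
```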
@@ -2516,6 +2531,7 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+
     # Hyperparameter grids for tuning
     param_grid_common_xgb = {
         'learning_rate': [0.01, 0.1, 0.2, 0.3],
@@ -3168,83 +3184,124 @@ def predict(
        ):
            if verbose:
                print(f"\nTraining and validating {name}:")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                )
-
-                gs.fit(x_train, y_train)
-                best_clf = gs.best_estimator_
-                # make sure x_train and x_test has the same name
-                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-                y_pred = best_clf.predict(x_true)
-                if hasattr(best_clf, "predict_proba"):
-                    y_pred_proba = best_clf.predict_proba(x_true)
-                    print("Shape of predicted probabilities:", y_pred_proba.shape)
-                    if y_pred_proba.shape[1] == 1:
-                        y_pred_proba = np.hstack(
-                            [1 - y_pred_proba, y_pred_proba]
-                        )  # Add missing class probabilities
-                    y_pred_proba = y_pred_proba[:, 1]
-                elif hasattr(best_clf, "decision_function"):
-                    # If predict_proba is not available, use decision_function (e.g., for SVM)
-                    y_pred_proba = best_clf.decision_function(x_true)
-                    # Ensure y_pred_proba is within 0 and 1 bounds
-                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                        y_pred_proba.max() - y_pred_proba.min()
+            try:
+                # Grid search with KFold or StratifiedKFold
+                if is_binary:
+                    gs = GridSearchCV(
+                        clf,
+                        param_grid=param_grids.get(name, {}),
+                        scoring=(
+                            "roc_auc"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
                )
-                else:
-                    y_pred_proba = None  # No probability output for certain models
-            else:
-                gs = GridSearchCV(
-                    clf,
-                    param_grid=param_grids.get(name, {}),
-                    scoring=(
-                        "roc_auc_ovr"
-                        if purpose == "classification"
-                        else "neg_mean_squared_error"
-                    ),
-                    cv=cv,
-                    n_jobs=n_jobs,
-                    verbose=verbose,
-                )
-
-                # Fit GridSearchCV
-                gs.fit(x_train, y_train)
-                best_clf = gs.best_estimator_
-
-                # Ensure x_true aligns with x_train columns
-                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-                y_pred = best_clf.predict(x_true)
-
-                # Handle prediction probabilities for multiclass
-                if hasattr(best_clf, "predict_proba"):
-                    y_pred_proba = best_clf.predict_proba(x_true)
-                elif hasattr(best_clf, "decision_function"):
-                    y_pred_proba = best_clf.decision_function(x_true)

-
-
-
-
-
-
-
+                    gs.fit(x_train, y_train)
+                    best_clf = gs.best_estimator_
+                    # make sure x_train and x_test has the same name
+                    x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                    y_pred = best_clf.predict(x_true)
+                    if hasattr(best_clf, "predict_proba"):
+                        y_pred_proba = best_clf.predict_proba(x_true)
+                        print("Shape of predicted probabilities:", y_pred_proba.shape)
+                        if y_pred_proba.shape[1] == 1:
+                            y_pred_proba = np.hstack(
+                                [1 - y_pred_proba, y_pred_proba]
+                            )  # Add missing class probabilities
+                        y_pred_proba = y_pred_proba[:, 1]
+                    elif hasattr(best_clf, "decision_function"):
+                        # If predict_proba is not available, use decision_function (e.g., for SVM)
+                        y_pred_proba = best_clf.decision_function(x_true)
+                        # Ensure y_pred_proba is within 0 and 1 bounds
+                        y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                            y_pred_proba.max() - y_pred_proba.min()
                        )
+                    else:
+                        y_pred_proba = None  # No probability output for certain models
+                    # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                    if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                        if hasattr(best_clf, "alphas_"):
+                            alphas_ = best_clf.alphas_
+                        elif hasattr(best_clf, "alpha_"):
+                            alphas_ = best_clf.alpha_
+                        elif hasattr(best_clf, "Cs_"):
+                            alphas_ = best_clf.Cs_
+                    else:
+                        alphas_ = None
+                    coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
                else:
-
+                    gs = GridSearchCV(
+                        clf,
+                        param_grid=param_grids.get(name, {}),
+                        scoring=(
+                            "roc_auc_ovr"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )

+                    # Fit GridSearchCV
+                    gs.fit(x_train, y_train)
+                    best_clf = gs.best_estimator_
+
+                    # Ensure x_true aligns with x_train columns
+                    x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                    # do i need to fit the x_train, y_train again?
+                    best_clf = best_clf.fit(x_train, y_train)
+                    y_pred = best_clf.predict(x_true)
+
+                    # Handle prediction probabilities for multiclass
+                    if hasattr(best_clf, "predict_proba"):
+                        y_pred_proba = best_clf.predict_proba(x_true)
+                    elif hasattr(best_clf, "decision_function"):
+                        y_pred_proba = best_clf.decision_function(x_true)
+
+                        # Normalize for multiclass if necessary
+                        if y_pred_proba.ndim == 2:
+                            y_pred_proba = (
+                                y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                            ) / (
+                                y_pred_proba.max(axis=1, keepdims=True)
+                                - y_pred_proba.min(axis=1, keepdims=True)
+                            )
+                    else:
+                        y_pred_proba = None  # No probability output for certain models
+                    # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                    if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                        if hasattr(best_clf, "alphas_"):
+                            alphas_ = best_clf.alphas_
+                        elif hasattr(best_clf, "alpha_"):
+                            alphas_ = best_clf.alpha_
+                        elif hasattr(best_clf, "Cs_"):
+                            alphas_ = best_clf.Cs_
+                    else:
+                        alphas_ = None
+                    coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+            except Exception as e:
+                alphas_, coef_ = None, None
+                print(f"skiped {clf}: {e}")
+                continue
+            # try to make predict format consistant
+            try:
+                y_pred = [i[0] for i in y_pred]
+            except:
+                pass
+            try:
+                y_true = [i[0] for i in y_true]
+            except:
+                pass
+            try:
+                y_train = [i[0] for i in y_train]
+            except:
+                pass
            validation_scores = {}

            if y_true is not None and y_pred_proba is not None:
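The main restructuring here wraps each model's grid search in `try/except`, so a single failing estimator is logged and skipped instead of aborting the whole loop, and it additionally pulls `alphas_`/`alpha_`/`Cs_` and `coef_` off the fitted best estimator when those attributes exist. A condensed, self-contained sketch of that control flow (not the verbatim py2ls code):

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=60, n_features=5, noise=0.1, random_state=0)
models = {"LassoCV": (LassoCV(cv=3), {})}  # an empty grid just fits the default parameters

results = {}
for name, (clf, grid) in models.items():
    try:
        gs = GridSearchCV(clf, param_grid=grid, scoring="neg_mean_squared_error", cv=3)
        gs.fit(X, y)
        best = gs.best_estimator_
        # mirror the diff: prefer alphas_, then alpha_, then Cs_, else None
        alphas_ = getattr(best, "alphas_", getattr(best, "alpha_", getattr(best, "Cs_", None)))
        coef_ = getattr(best, "coef_", None)
        results[name] = {"best_clf": best, "alphas": alphas_, "coef": coef_}
    except Exception as e:
        print(f"skipped {name}: {e}")  # log the failure and move on to the next model
        continue
```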
@@ -3294,20 +3351,26 @@ def predict(
                        "roc_curve": roc_info,
                        "pr_curve": pr_info,
                        "confusion_matrix": confusion_matrix(y_true, y_pred),
-                        "predictions": y_pred
+                        "predictions": y_pred,  # .tolist(),
                        "predictions_proba": (
                            y_pred_proba.tolist() if y_pred_proba is not None else None
                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                    }
                else:  # "regression"
                    results[name] = {
                        "best_clf": gs.best_estimator_,
                        "best_params": gs.best_params_,
                        "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                        "predictions": y_pred
+                        "predictions": y_pred,  # .tolist(),
                        "predictions_proba": (
                            y_pred_proba.tolist() if y_pred_proba is not None else None
                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                    }
            else:  # multi-classes
                if y_pred_proba is not None:
@@ -3346,20 +3409,26 @@ def predict(
                        "roc_curve": roc_info,
                        "pr_curve": pr_info,
                        "confusion_matrix": confusion_matrix(y_true, y_pred),
-                        "predictions": y_pred
+                        "predictions": y_pred,  # .tolist(),
                        "predictions_proba": (
                            y_pred_proba.tolist() if y_pred_proba is not None else None
                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                    }
                else:  # "regression"
                    results[name] = {
                        "best_clf": gs.best_estimator_,
                        "best_params": gs.best_params_,
                        "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                        "predictions": y_pred
+                        "predictions": y_pred,  # .tolist(),
                        "predictions_proba": (
                            y_pred_proba.tolist() if y_pred_proba is not None else None
                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                    }

            else:
@@ -3378,12 +3447,15 @@ def predict(
                    "best_clf": gs.best_estimator_,
                    "best_params": gs.best_params_,
                    "scores": validation_scores,
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                    "predictions_proba": (
                        y_pred_proba.tolist() if y_pred_proba is not None else None
                    ),
+                    "features": share_col_names,
                    "y_train": y_train if y_train is not None else [],
                    "y_true": y_true if y_true is not None else [],
+                    "coef": coef_,
+                    "alphas": alphas_
                }

        # Convert results to DataFrame
@@ -3446,7 +3518,7 @@ def predict(
        for i, j in top_models.to_dict().items():
            base_estimators.append((i, j))
        if stacking_cv:
-            print(f"
+            print(f"⤵ stacking_cv is processing...")
            # * define a few representative final_estimator candidates
            # several alternatives to choose from
            if purpose == "classification":
@@ -3520,7 +3592,7 @@ def predict(
            best_final_estimator = cv_results_df.iloc[0]['final_estimator']
            print(f"Best final estimator based on cross-validation: {best_final_estimator}")
        else:
-            print(f"
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
            if purpose == "classification":
                best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                          random_state=random_state,
@@ -3530,26 +3602,25 @@ def predict(
        print(f"⤵ the best best_final_estimator: {best_final_estimator}")
        #! apply stacking
        if purpose == "classification":
-            print(f"
+            print(f"⤵ StackingClassifier...")
            stacking_model = StackingClassifier(estimators=base_estimators,
                                                final_estimator=best_final_estimator,
                                                cv=cv)
        else:
-            print(f"
+            print(f"⤵ StackingRegressor...")
            stacking_model = StackingRegressor(estimators=base_estimators,
                                               final_estimator=best_final_estimator,
                                               cv=cv)

        # Train the Stacking Classifier
-        print(f"
+        print(f"⤵ fit & predict...")
        stacking_model.fit(x_train, y_train)
        y_pred_final = stacking_model.predict(x_true)
-        print(f"
+        print(f"⤵ collecting results...")
        # pred_proba
        if is_binary:
            if hasattr(stacking_model, "predict_proba"):
                y_pred_proba_final = stacking_model.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
                if y_pred_proba_final.shape[1] == 1:
                    y_pred_proba_final = np.hstack(
                        [1 - y_pred_proba_final, y_pred_proba_final]
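The stacking path itself is unchanged apart from the labelled progress messages and the removal of a debug shape print. For reference, the scikit-learn construct it drives looks like this (a generic sketch with stand-in base estimators, not the models py2ls actually selects):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=150, n_features=8, random_state=0)
base_estimators = [
    ("rf", RandomForestClassifier(random_state=0)),
    ("svm", SVC(probability=True, random_state=0)),
]
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(max_iter=500),  # the meta-learner fitted on out-of-fold predictions
    cv=5,
)
stacking_model.fit(X, y)
print(stacking_model.predict_proba(X[:3]))
```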
@@ -3564,6 +3635,17 @@ def predict(
                    )
            else:
                y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
        if not is_binary:
            # Handle prediction probabilities for multiclass
            if hasattr(stacking_model, "predict_proba"):
@@ -3581,6 +3663,17 @@ def predict(
                    )
            else:
                y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
        #! dict_pred_stack
        dict_pred_stack = {}
        validation_scores_final = {}
@@ -3631,6 +3724,9 @@ def predict(
                    "predictions_proba": (
                        y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                }
            else:  # "regression"
                dict_pred_stack = {
@@ -3641,6 +3737,9 @@ def predict(
                    "predictions_proba": (
                        y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                }
        else:  # multi-classes
            if y_pred_proba_final is not None:
@@ -3680,6 +3779,9 @@ def predict(
                    "predictions_proba": (
                        y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                }
            else:  # "regression"
                dict_pred_stack = {
@@ -3690,6 +3792,9 @@ def predict(
                    "predictions_proba": (
                        y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                }

        else:
@@ -3712,8 +3817,11 @@ def predict(
                "predictions_proba": (
                    y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                ),
+                "features": share_col_names,
                "y_train": y_train if y_train is not None else [],
                "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
            }
        # merge together
        df_pred = pd.DataFrame(
|
@@ -3728,16 +3836,16 @@ def predict(
|
|
3728
3836
|
# if dir_save:
|
3729
3837
|
# ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
|
3730
3838
|
if vote:
|
3731
|
-
print(f"
|
3839
|
+
print(f"⤵ voting...")
|
3732
3840
|
from sklearn.ensemble import VotingClassifier, VotingRegressor
|
3733
|
-
#!
|
3841
|
+
#! voting
|
3734
3842
|
n_top_models = min(n_top_models, df_results.shape[0])
|
3735
3843
|
base_estimators=[]
|
3736
3844
|
for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
|
3737
3845
|
base_estimators.append((name,cls))
|
3738
3846
|
# Apply Voting Classifier/Regressor
|
3739
3847
|
if purpose == "classification":
|
3740
|
-
print(f"
|
3848
|
+
print(f"⤵ VotingClassifier...via{voting}")
|
3741
3849
|
if voting=='hard':
|
3742
3850
|
# Hard voting does not support `predict_proba`
|
3743
3851
|
voting_model = VotingClassifier(estimators=base_estimators)
|
@@ -3745,7 +3853,7 @@ def predict(
                # Soft voting supports `predict_proba`
                voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
        else:
-            print(f"
+            print(f"⤵ VotingRegressor...")
            voting_model = VotingRegressor(estimators=base_estimators)

        # Train the Voting Classifier/Regressor
@@ -3770,10 +3878,23 @@ def predict(
                    y_pred_proba_vote = y_pred_proba_vote[:, 1]
            else:
                y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
        else:  # Regression
            y_pred_proba_vote = None
+            coef_, alphas_ = None, None

-        print(f"
+        print(f"⤵ collecting voting results...")
        #! dict_pred_vote
        dict_pred_vote = {}
        validation_scores_vote = {}
@@ -3822,6 +3943,9 @@ def predict(
                    "predictions_proba": (
                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                }
            else:  # Multi-class
                if y_pred_proba_vote is not None:
@@ -3856,6 +3980,9 @@ def predict(
                    "predictions_proba": (
                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                }
        else:
            if y_true is None:
@@ -3877,6 +4004,7 @@ def predict(
                "predictions_proba": (
                    y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                ),
+                "features": share_col_names,
                "y_train": y_train if y_train is not None else [],
                "y_true": y_true if y_true is not None else [],
            }
@@ -3907,9 +4035,15 @@ def predict(
    now_ = datetime.now().strftime("%y%m%d_%H%M%S")
    # try:
    if df_res.shape[0] > 3:
-
+        try:
+            plot_validate_features(df_res, is_binary=is_binary)
+        except Exception as e:
+            print(e)
    else:
-
+        try:
+            plot_validate_features_single(df_res, is_binary=is_binary)
+        except Exception as e:
+            print(e)
    if dir_save:
        ips.figsave(dir_save + f"validate_features{now_}.pdf")
    # except Exception as e: