gpbench 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
@@ -0,0 +1,182 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import swanlab
5
+ import argparse
6
+ import random
7
+ import torch
8
+ import numpy as np
9
+ import pandas as pd
10
+ from sklearn.model_selection import KFold
11
+ from scipy.stats import pearsonr
12
+ from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
13
+
14
# rpy2 bridge: embeds an R interpreter in this Python process.
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
# Auto-convert numpy arrays <-> R matrices when passing data to R.
numpy2ri.activate()

# MASS supplies matrix utilities used for the BLUP inversion step.
ro.r('library(MASS)')
21
+
22
def gblup_r_vanraden_reml(X_train, y_train, X_test):
    """Fit GBLUP in R (rrBLUP, REML) and predict the test set.

    Builds a VanRaden genomic relationship matrix from the training
    genotypes, estimates variance components with mixed.solve, then
    predicts test-set breeding values plus the fixed intercept.

    Args:
        X_train: Training genotype matrix (samples x SNPs); the
            allele-frequency step assumes 0/1/2 coding -- TODO confirm.
        y_train: Training phenotype vector.
        X_test: Test genotype matrix with the same SNP columns.

    Returns:
        1-D numpy array of predicted phenotypes for X_test.
    """
    # Pass data to R (numpy2ri converts arrays to R matrices).
    ro.globalenv['X_train'] = X_train
    ro.globalenv['y_train'] = y_train
    ro.globalenv['X_test'] = X_test

    r_code = """
    library(rrBLUP)

    n_train <- nrow(X_train)
    m <- ncol(X_train)

    # Step1: allele frequencies
    p <- colMeans(X_train) / 2
    p <- pmax(pmin(p, 0.99), 0.01)

    # Step2: VanRaden standardized genotype
    Z_train <- sweep(X_train, 2, 2*p, "-") / sqrt(2*p*(1-p))
    Z_train[is.na(Z_train)] <- 0

    Z_test <- sweep(X_test, 2, 2*p, "-") / sqrt(2*p*(1-p))
    Z_test[is.na(Z_test)] <- 0

    # Step3: Genomic relationship matrix (VanRaden method 2)
    denom <- sum(2*p*(1-p))
    G <- Z_train %*% t(Z_train) / denom
    G <- G + diag(1e-6, n_train) # stability

    # Step4: REML GBLUP
    fit <- mixed.solve(y = y_train, K = G, SE = FALSE)

    # Extract variance components and fixed effect
    Vu <- fit$Vu
    Ve <- fit$Ve
    mu <- as.numeric(fit$beta) # <-- 转成标量,避免非兼容数组
    h2 <- Vu / (Vu + Ve)

    # Step5: GBLUP prediction for test set
    y_centered <- y_train - mu
    A <- G + (Ve / Vu) * diag(n_train) # G + λ I

    G_test_train <- Z_test %*% t(Z_train) / denom
    u_test <- G_test_train %*% solve(A, y_centered) # strictly correct formula

    y_pred <- mu + u_test
    y_pred
    """

    # The last R expression (y_pred) is the value returned by ro.r().
    y_pred = np.array(ro.r(r_code)).flatten()
    return y_pred
73
+
74
+
75
def parse_args():
    """Build and parse the command-line options for the GBLUP run."""
    parser = argparse.ArgumentParser(description="Argument parser")
    # (flag, default, help) for every supported string option.
    cli_options = [
        ('--methods', 'GBLUP_R/', 'Method name'),
        ('--species', '', None),
        ('--phe', '', 'Dataset name'),
        ('--data_dir', '../../data/', 'Path to data directory'),
        ('--result_dir', 'result/', 'Path to result directory'),
    ]
    for flag, default, help_text in cli_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    return parser.parse_args()
84
+
85
+
86
def load_data(args):
    """Load genotype/phenotype arrays for the selected species.

    Returns:
        (genotypes, phenotypes, n_samples, n_snps, trait_names)
    """
    species_dir = os.path.join(args.data_dir, args.species)
    xData = np.load(os.path.join(species_dir, 'genotype.npz'))["arr_0"]
    phenotype_archive = np.load(os.path.join(species_dir, 'phenotype.npz'))
    yData = phenotype_archive["arr_0"]
    names = phenotype_archive["arr_1"]

    nsample, nsnp = xData.shape[0], xData.shape[1]
    print("Number of samples: ", nsample)
    print("Number of SNPs: ", nsnp)
    return xData, yData, nsample, nsnp, names
96
+
97
+
98
def set_seed(seed=42):
    """Seed python, numpy and torch RNGs for reproducible runs.

    Args:
        seed: Integer seed applied to every random source.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Bug fix: torch.manual_seed expects an int; the original wrapped the
    # seed in torch.tensor(seed) for no reason.
    torch.manual_seed(seed)
    # Only touch CUDA seeding when a GPU is present (matches the guarded
    # set_seed used elsewhere in this package).
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels trade speed for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
105
+
106
+
107
def run_nested_cv(args, data, label, process):
    """Run 10-fold cross-validation with the R GBLUP model.

    For every fold: fit/predict via gblup_r_vanraden_reml, record
    PCC/MAE/MSE/R2 plus time and memory, and dump the per-fold
    predictions to <result_dir>/fold<k>.csv. Prints a summary at the end.

    Args:
        args: Parsed CLI namespace (uses result_dir/methods/species/phe).
        data: Genotype matrix, one row per sample.
        label: Phenotype vector aligned with `data`.
        process: psutil.Process used to sample resident memory.
    """
    out_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(out_dir, exist_ok=True)
    print("Starting 10-fold cross-validation with GBLUP (R VanRaden)...")

    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    pcc_scores, mae_scores, mse_scores, r2_scores = [], [], [], []
    cv_start = time.time()

    for fold, (tr, te) in enumerate(splitter.split(label)):
        print(f"===== Fold {fold} =====")
        t0 = time.time()

        # Fit GBLUP on the training fold and predict the held-out fold.
        y_pred = gblup_r_vanraden_reml(data[tr], label[tr], data[te])
        y_true = label[te]

        # Per-fold evaluation metrics.
        pcc = pearsonr(y_true, y_pred)[0]
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        pcc_scores.append(pcc)
        mae_scores.append(mae)
        mse_scores.append(mse)
        r2_scores.append(r2)

        elapsed = time.time() - t0
        gpu_mb = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
        cpu_mb = process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={elapsed:.2f}s, '
              f'GPU={gpu_mb:.2f}MB, CPU={cpu_mb:.2f}MB')

        pd.DataFrame({'Y_test': y_true, 'Y_pred': y_pred}).to_csv(
            os.path.join(out_dir, f"fold{fold}.csv"), index=False)

    print("\n===== Cross-validation summary =====")
    print(f"Average PCC: {np.mean(pcc_scores):.4f} ± {np.std(pcc_scores):.4f}")
    print(f"Average MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
    print(f"Average MSE: {np.mean(mse_scores):.4f} ± {np.std(mse_scores):.4f}")
    print(f"Average R2 : {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
    print(f"Time: {time.time() - cv_start:.2f}s")
151
+
152
+
153
+
154
def GBLUP_reg():
    """Entry point: run GBLUP 10-fold CV for every species/trait.

    Loads genotype/phenotype data per species, imputes missing trait
    values with the trait mean, then delegates to run_nested_cv.
    """
    set_seed(42)
    # Bug fix: the CUDA memory calls crash on CPU-only machines, and the
    # GBLUP model itself runs in R, so the GPU is purely optional here.
    # (The original also created an unused hard-coded cuda:0 device.)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    args = parse_args()
    process = psutil.Process(os.getpid())
    all_species = ['Cotton/']

    for sp in all_species:
        args.species = sp
        X, Y, nsamples, nsnp, names = load_data(args)
        for i, phe in enumerate(names):
            args.phe = phe
            print("starting run " + args.methods + args.species + args.phe)
            # Impute missing phenotype values with the trait mean.
            label = Y[:, i]
            label = np.nan_to_num(label, nan=np.nanmean(label))
            start_time = time.time()
            if torch.cuda.is_available():
                torch.cuda.reset_peak_memory_stats()

            run_nested_cv(args, data=X, label=label, process=process)

            elapsed_time = time.time() - start_time
            print(f"运行时间: {elapsed_time:.2f} 秒")
            print("successfully")
179
+
180
+
181
# Allow running this module directly as a script.
if __name__ == "__main__":
    GBLUP_reg()
@@ -0,0 +1,5 @@
1
# Package API for the GBLUP method: re-export the regression entry point.
from .GBLUP_R import GBLUP_reg

# Alias so callers may import either name.
GBLUP = GBLUP_reg

__all__ = ["GBLUP","GBLUP_reg"]
@@ -0,0 +1,164 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import psutil
5
+ import time
6
+ import random
7
+ import numpy as np
8
+ import pandas as pd
9
+ import pynvml
10
+ from . import GEFormer_Hyperparameters
11
+ from .gMLP import GEFormer
12
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
13
+ from torch.utils.data import DataLoader, TensorDataset
14
+ from sklearn.model_selection import KFold, train_test_split
15
+ from scipy.stats import pearsonr
16
+
17
def parse_args():
    """Parse the command-line options controlling a GEFormer run."""
    parser = argparse.ArgumentParser(description="Argument parser")
    # (flag, type, default, help) for every supported option; run/bookkeeping
    # options first, then the training hyper-parameters.
    specs = [
        ('--methods', str, 'GEFormer/', 'Random seed'),
        ('--species', str, 'Pig/', None),
        ('--phe', str, '', 'Dataset name'),
        ('--data_dir', str, '../../data/', None),
        ('--result_dir', str, 'result/', None),
        ('--epoch', int, 1000, 'Number of training rounds'),
        ('--batch_size', int, 64, 'Batch size'),
        ('--lr', float, 0.01, 'Learning rate'),
        ('--patience', int, 10, 'Patience for early stopping'),
        ('--dropout1', float, 0.5, 'Dropout rate for layer 1'),
        ('--dropout2', float, 0.5, 'Dropout rate for layer 2'),
    ]
    for flag, value_type, default, help_text in specs:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)
    return parser.parse_args()
33
+
34
def load_data(args):
    """Read the species' genotype and phenotype .npz archives.

    Returns:
        (genotypes, phenotypes, n_samples, n_snps, trait_names)
    """
    base = os.path.join(args.data_dir, args.species)
    xData = np.load(os.path.join(base, 'genotype.npz'))["arr_0"]
    phe_npz = np.load(os.path.join(base, 'phenotype.npz'))
    yData, names = phe_npz["arr_0"], phe_npz["arr_1"]

    nsample, nsnp = xData.shape[:2]
    print("Number of samples: ", nsample)
    print("Number of SNPs: ", nsnp)
    return xData, yData, nsample, nsnp, names
44
+
45
def set_seed(seed=42):
    """Seed python, numpy and torch RNGs for deterministic training.

    Args:
        seed: Integer seed applied to every random source.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Consistency fix: guard CUDA seeding like the set_seed in
    # GEFormer_Hyperparameters, so CPU-only hosts behave identically.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels trade speed for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
52
+
53
def get_gpu_mem_by_pid(pid, handle=None):
    """Return the GPU memory (MB) used by `pid` on an NVML device.

    Args:
        pid: Process id to look up among the device's compute processes.
        handle: Optional NVML device handle; defaults to device 0.

    Returns:
        Used GPU memory in MB, or 0.0 when the process is not found.
    """
    # Bug fix: the original read a global `handle` that was never defined at
    # module level (it was only a local inside GEFormer_reg), so every call
    # raised NameError. Resolve the handle here instead.
    if handle is None:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        if p.pid == pid:
            return p.usedGpuMemory / 1024**2
    return 0.0
59
+
60
def run_nested_cv(args, data, label, nsnp, device):
    """Run 10-fold cross-validation of GEFormer on one phenotype.

    For every fold: split off 10% of the training fold for early stopping,
    train a fresh GEFormer, predict the held-out fold, and record
    MSE/MAE/R2/PCC plus time/memory usage. Per-fold predictions are
    written to <result_dir>/fold<k>.csv.

    Args:
        args: CLI namespace (uses result_dir/methods/species/phe plus
            batch_size, epoch, lr, patience).
        data: Genotype matrix (samples x SNPs) as a numpy array.
        label: Phenotype vector aligned with `data`.
        nsnp: SNP count forwarded to the GEFormer constructor.
        device: torch.device the tensors and model are placed on.
    """
    result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(result_dir, exist_ok=True)
    print("Starting 10-fold cross-validation...")
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    all_mse, all_mae, all_r2, all_pcc = [], [], [], []
    time_star = time.time()
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        process = psutil.Process(os.getpid())
        fold_start_time = time.time()

        x_train, x_test = data[train_idx], data[test_idx]
        y_train, y_test = label[train_idx], label[test_idx]

        # Hold out 10% of the training fold for early stopping.
        X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

        x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
        y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
        x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
        y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
        x_test_tensor = torch.from_numpy(x_test).float().to(device)
        y_test_tensor = torch.from_numpy(y_test).float().to(device)

        train_data = TensorDataset(x_train_tensor, y_train_tensor)
        valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
        test_data = TensorDataset(x_test_tensor, y_test_tensor)

        train_loader = DataLoader(train_data, args.batch_size, shuffle=True)
        valid_loader = DataLoader(valid_data, args.batch_size, shuffle=False)
        test_loader = DataLoader(test_data, args.batch_size, shuffle=False)

        model = GEFormer(nsnp=nsnp).to(device)
        model.train_model(train_loader, valid_loader, args.epoch, args.lr, args.patience, device)
        y_pred = model.predict(test_loader)

        # Bug fix: this metric block was duplicated verbatim in the
        # original; computing each metric once is sufficient.
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        pcc, _ = pearsonr(y_test, y_pred)

        all_mse.append(mse)
        all_r2.append(r2)
        all_mae.append(mae)
        all_pcc.append(pcc)

        fold_time = time.time() - fold_start_time
        fold_gpu_mem = get_gpu_mem_by_pid(os.getpid())
        fold_cpu_mem = process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
              f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')

        # Bug fix: these CUDA memory calls raise on CPU-only machines.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
        results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)

    print("\n===== Cross-validation summary =====")
    print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
    print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
    print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
    print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
    print(f"Time: {time.time() - time_star:.2f}s")
129
+
130
+
131
def GEFormer_reg():
    """Entry point: tune then run GEFormer 10-fold CV per species/trait.

    For each trait: impute missing phenotype values with the trait mean,
    tune lr/batch_size/patience with Optuna, then run the full CV with
    the tuned settings.
    """
    set_seed(42)
    # Bug fix: unguarded CUDA calls crash on CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # NVML is initialized once so get_gpu_mem_by_pid can query device 0.
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    args = parse_args()
    all_species = ['Cotton/']

    for i in range(len(all_species)):
        args.species = all_species[i]
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        args.device = device
        X, Y, nsamples, nsnp, names = load_data(args)
        for j in range(len(names)):
            args.phe = names[j]
            print("starting run " + args.methods + args.species + args.phe)
            # Impute missing phenotype values with the trait mean.
            label = Y[:, j]
            label = np.nan_to_num(label, nan=np.nanmean(label))
            best_params = GEFormer_Hyperparameters.Hyperparameter(X, label, nsnp)
            # Bug fix: the tuned learning rate was stored in
            # args.learning_rate, but run_nested_cv reads args.lr -- so the
            # tuned value was silently ignored.
            args.lr = best_params['learning_rate']
            args.batch_size = best_params['batch_size']
            args.patience = best_params['patience']
            start_time = time.time()
            if torch.cuda.is_available():
                torch.cuda.reset_peak_memory_stats()
            process = psutil.Process(os.getpid())
            run_nested_cv(args, data=X, label=label, nsnp=nsnp, device=args.device)

            elapsed_time = time.time() - start_time
            print(f"running time: {elapsed_time:.2f} s")
            print("successfully")
161
+
162
+
163
# Allow running this module directly as a script.
if __name__ == '__main__':
    GEFormer_reg()
@@ -0,0 +1,106 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import random
5
+ import torch
6
+ import numpy as np
7
+ import optuna
8
+ from sklearn.model_selection import KFold, train_test_split
9
+ from .gMLP import GEFormer
10
+ from scipy.stats import pearsonr
11
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
12
+ from torch.utils.data import DataLoader, TensorDataset
13
+ from optuna.exceptions import TrialPruned
14
+
15
def run_nested_cv_with_early_stopping(data, label, nsnp, learning_rate, patience, batch_size, epoch=1000):
    """Optuna objective helper: 10-fold CV of GEFormer, returning mean PCC.

    Args:
        data: Genotype matrix (samples x SNPs).
        label: Phenotype vector aligned with `data`.
        nsnp: SNP count forwarded to the GEFormer constructor.
        learning_rate: Learning rate under evaluation.
        patience: Early-stopping patience under evaluation.
        batch_size: Mini-batch size under evaluation.
        epoch: Maximum number of training epochs per fold.

    Returns:
        Mean Pearson correlation across folds (0.0 if no folds ran).

    Raises:
        TrialPruned: when any fold produces a NaN correlation.
    """
    # Bug fix: hard-coding cuda:0 crashed tuning on machines without a GPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Starting 10-fold cross-validation...")
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    all_mse, all_mae, all_r2, all_pcc = [], [], [], []

    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        process = psutil.Process(os.getpid())
        fold_start_time = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # Hold out 10% of the training fold for early stopping.
        X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

        x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
        y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
        x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
        y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
        x_test_tensor = torch.from_numpy(X_test).float().to(device)
        y_test_tensor = torch.from_numpy(y_test).float().to(device)

        train_data = TensorDataset(x_train_tensor, y_train_tensor)
        valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
        test_data = TensorDataset(x_test_tensor, y_test_tensor)

        train_loader = DataLoader(train_data, batch_size, shuffle=True)
        valid_loader = DataLoader(valid_data, batch_size, shuffle=False)
        test_loader = DataLoader(test_data, batch_size, shuffle=False)

        # Bug fix: the model was never moved to `device`, so training failed
        # with a CPU/GPU tensor mismatch whenever the data was on CUDA.
        model = GEFormer(nsnp=nsnp).to(device)
        model.train_model(train_loader, valid_loader, epoch, learning_rate, patience, device)
        y_pred = model.predict(test_loader)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        pcc, _ = pearsonr(y_test, y_pred)

        # A NaN correlation means this hyper-parameter set diverged.
        if np.isnan(pcc):
            print(f"Fold {fold} resulted in NaN PCC, pruning the trial...")
            raise TrialPruned()

        all_mse.append(mse)
        all_r2.append(r2)
        all_mae.append(mae)
        all_pcc.append(pcc)

        fold_time = time.time() - fold_start_time
        fold_cpu_mem = process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
              f'CPU={fold_cpu_mem:.2f}MB')

    return np.mean(all_pcc) if all_pcc else 0.0
70
+
71
def set_seed(seed=42):
    """Make python, numpy and torch randomness reproducible for tuning."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Prefer deterministic cuDNN kernels over autotuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
79
+
80
def Hyperparameter(data, label, nsnp):
    """Tune GEFormer's lr / batch size / patience with Optuna (20 trials).

    Args:
        data: Genotype matrix (samples x SNPs).
        label: Phenotype vector aligned with `data`.
        nsnp: SNP count forwarded to the model constructor.

    Returns:
        dict with keys 'learning_rate', 'batch_size', 'patience'.
    """
    set_seed(42)

    def objective(trial):
        # Fix: suggest_loguniform is deprecated in Optuna; suggest_float
        # with log=True is the supported equivalent.
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
        batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
        patience = trial.suggest_int("patience", 1, 10)
        try:
            corr_score = run_nested_cv_with_early_stopping(
                data=data,
                label=label,
                nsnp=nsnp,
                learning_rate=learning_rate,
                patience=patience,
                batch_size=batch_size
            )
        except TrialPruned:
            # NOTE(review): returning -inf records the trial as COMPLETE
            # rather than PRUNED; kept as-is to preserve existing behavior.
            return float("-inf")
        return corr_score

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    print("best params:", study.best_params)
    print("successfully")
    return study.best_params
@@ -0,0 +1,5 @@
1
# Package API for the GEFormer method: re-export the regression entry point.
from .GEFormer import GEFormer_reg

# Alias so callers may import either name.
GEFormer = GEFormer_reg

__all__ = ["GEFormer","GEFormer_reg"]