gpbench-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py
@@ -0,0 +1,112 @@
+import os
+import time
+import psutil
+import random
+import torch
+import numpy as np
+import optuna
+from sklearn.model_selection import StratifiedKFold, train_test_split
+from .AlexNet_206_class import AlexNet
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+from torch.utils.data import DataLoader, TensorDataset
+from optuna.exceptions import TrialPruned
+
+def run_nested_cv_with_early_stopping(data, label, nsnp, num_classes, learning_rate, patience, batch_size, num_round=300):
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    print("Starting 10-fold cross-validation...")
+    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
+    all_acc, all_prec, all_rec, all_f1 = [], [], [], []
+
+    for fold, (train_index, test_index) in enumerate(kf.split(data, label)):
+        print(f"Running fold {fold}...")
+        process = psutil.Process(os.getpid())
+        fold_start_time = time.time()
+
+        X_train, X_test = data[train_index], data[test_index]
+        y_train, y_test = label[train_index], label[test_index]
+
+        # Hold out 10% of the training fold as a validation set for early stopping
+        X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
+            X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
+        )
+
+        x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
+        y_train_tensor = torch.from_numpy(y_train_sub).long().to(device)
+        x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
+        y_valid_tensor = torch.from_numpy(y_valid).long().to(device)
+        x_test_tensor = torch.from_numpy(X_test).float().to(device)
+        y_test_tensor = torch.from_numpy(y_test).long().to(device)
+
+        train_data = TensorDataset(x_train_tensor, y_train_tensor)
+        valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+        test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+        train_loader = DataLoader(train_data, batch_size, shuffle=True)
+        valid_loader = DataLoader(valid_data, batch_size, shuffle=False)
+        test_loader = DataLoader(test_data, batch_size, shuffle=False)
+
+        model = AlexNet(num_classes=num_classes)
+        model.train_model(train_loader, valid_loader, num_round, learning_rate, patience, device)
+        y_pred = model.predict(test_loader)
+
+        acc = accuracy_score(y_test, y_pred)
+        prec, rec, f1, _ = precision_recall_fscore_support(
+            y_test, y_pred, average="macro", zero_division=0
+        )
+
+        # A degenerate fold invalidates the whole trial
+        if np.isnan(f1) or f1 <= 0:
+            print(f"Fold {fold} resulted in NaN or zero F1, pruning the trial...")
+            raise TrialPruned()
+
+        all_acc.append(acc)
+        all_prec.append(prec)
+        all_rec.append(rec)
+        all_f1.append(f1)
+
+        fold_time = time.time() - fold_start_time
+        fold_cpu_mem = process.memory_info().rss / 1024**2
+        print(f'Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, '
+              f'Time={fold_time:.2f}s, CPU={fold_cpu_mem:.2f}MB')
+
+    print("\n===== CV Summary =====")
+    print(f"ACC : {np.mean(all_acc):.4f} ± {np.std(all_acc):.4f}")
+    print(f"PREC: {np.mean(all_prec):.4f} ± {np.std(all_prec):.4f}")
+    print(f"REC : {np.mean(all_rec):.4f} ± {np.std(all_rec):.4f}")
+    print(f"F1 : {np.mean(all_f1):.4f} ± {np.std(all_f1):.4f}")
+
+    return float(np.mean(all_f1)) if all_f1 else 0.0
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+def Hyperparameter(data, label, nsnp, num_classes):
+    set_seed(42)
+
+    def objective(trial):
+        learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
+        batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
+        patience = trial.suggest_int("patience", 10, 100, step=10)
+        try:
+            f1_score = run_nested_cv_with_early_stopping(
+                data=data,
+                label=label,
+                nsnp=nsnp,
+                num_classes=num_classes,
+                learning_rate=learning_rate,
+                patience=patience,
+                batch_size=batch_size
+            )
+        except TrialPruned:
+            # Pruned trials score -inf so they can never become the best trial
+            return float("-inf")
+        return f1_score
+
+    study = optuna.create_study(direction="maximize")
+    study.optimize(objective, n_trials=20)
+
+    print("best params:", study.best_params)
+    print("Finished successfully")
+    return study.best_params
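For orientation, a minimal sketch of how this tuner could be driven on synthetic data. The 2-D samples-by-SNPs shape and the class count are assumptions for illustration; the tensor layout AlexNet actually expects is defined in AlexNet_206_class, which is not shown in this hunk:

import numpy as np
from gpbench.method_class.SoyDNGP import SoyDNGP_he_class

# Synthetic stand-in: 200 samples x 1000 SNPs, 3 phenotype classes
X = np.random.rand(200, 1000).astype(np.float32)
y = np.random.randint(0, 3, size=200)

best = SoyDNGP_he_class.Hyperparameter(X, y, nsnp=1000, num_classes=3)
print(best)  # e.g. {'learning_rate': ..., 'batch_size': ..., 'patience': ...}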
gpbench/method_class/SoyDNGP/__init__.py
@@ -0,0 +1,5 @@
+from .SoyDNGP_class import SoyDNGP_class
+
+SoyDNGP = SoyDNGP_class
+
+__all__ = ["SoyDNGP", "SoyDNGP_class"]
gpbench/method_class/XGBoost/XGboost_GPU_class.py
@@ -0,0 +1,198 @@
+import os
+import time
+import torch
+import psutil
+import argparse
+import random
+import xgboost as xgb
+import numpy as np
+import pandas as pd
+import pynvml
+import swanlab
+
+from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+from . import XGboost_GPU_he_class
+
+# =======================
+# Argument parser
+# =======================
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--methods', type=str, default='XGBoost/')
+    parser.add_argument('--species', type=str, default='Horse/')
+    parser.add_argument('--phe', type=str, default='')
+    parser.add_argument('--data_dir', type=str, default='../../data/')
+    parser.add_argument('--result_dir', type=str, default='result/')
+
+    parser.add_argument('--learning_rate', type=float, default=0.1)
+    parser.add_argument('--n_estimators', type=int, default=200)
+    parser.add_argument('--max_depth', type=int, default=6)
+    parser.add_argument('--min_child_weight', type=int, default=1)
+    parser.add_argument('--subsample', type=float, default=0.8)
+    parser.add_argument('--colsample_bytree', type=float, default=0.8)
+    parser.add_argument('--gamma', type=float, default=0)
+    parser.add_argument('--reg_alpha', type=float, default=0)
+    parser.add_argument('--reg_lambda', type=float, default=1)
+
+    parser.add_argument('--use_gpu', action='store_true')
+    return parser.parse_args()
+
+def load_data(args):
+    X = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+    Y = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_0"]
+    print(f"Samples: {X.shape[0]}, SNPs: {X.shape[1]}")
+    return X, Y
+
+def get_gpu_mem_by_pid(pid, handle):
+    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+    for p in procs:
+        if p.pid == pid:
+            return p.usedGpuMemory / 1024**2
+    return 0.0
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+def run_cv(args, X, label, handle=None):
+    result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+    os.makedirs(result_dir, exist_ok=True)
+    le = LabelEncoder()
+    y_all = le.fit_transform(label)
+    np.save(os.path.join(result_dir, 'label_mapping.npy'), le.classes_)
+    num_classes = len(np.unique(y_all))
+
+    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
+
+    # ===== GPU / CPU =====
+    use_gpu = args.use_gpu and torch.cuda.is_available()
+    if use_gpu:
+        tree_method = "hist"
+        device = "cuda:0"
+        print("🚀 Using GPU XGBoost")
+    else:
+        tree_method = "hist"
+        device = "cpu"
+        print("⚠ Using CPU XGBoost")
+
+    all_acc, all_prec, all_rec, all_f1 = [], [], [], []
+    start_time = time.time()
+    process = psutil.Process(os.getpid())
+
+    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y_all)):
+        print(f"\n===== Fold {fold} =====")
+        fold_start = time.time()
+
+        X_train, X_test = X[train_idx], X[test_idx]
+        y_train, y_test = y_all[train_idx], y_all[test_idx]
+
+        # ===== Objective =====
+        if num_classes == 2:
+            objective = "binary:logistic"
+            eval_metric = "logloss"
+        else:
+            objective = "multi:softprob"
+            eval_metric = "mlogloss"
+
+        model = xgb.XGBClassifier(
+            learning_rate=args.learning_rate,
+            n_estimators=args.n_estimators,
+            max_depth=args.max_depth,
+            min_child_weight=args.min_child_weight,
+            subsample=args.subsample,
+            colsample_bytree=args.colsample_bytree,
+            gamma=args.gamma,
+            reg_alpha=args.reg_alpha,
+            reg_lambda=args.reg_lambda,
+            objective=objective,
+            eval_metric=eval_metric,
+            num_class=num_classes if num_classes > 2 else None,
+            tree_method=tree_method,
+            device=device,
+            random_state=42,
+            n_jobs=-1
+        )
+
+        model.fit(X_train, y_train)
+
+        # ===== Prediction =====
+        y_proba = model.predict_proba(X_test)
+        y_pred = np.argmax(y_proba, axis=1)
+
+        # ===== Metrics =====
+        acc = accuracy_score(y_test, y_pred)
+        prec, rec, f1, _ = precision_recall_fscore_support(
+            y_test, y_pred, average="macro", zero_division=0
+        )
+
+        all_acc.append(acc)
+        all_prec.append(prec)
+        all_rec.append(rec)
+        all_f1.append(f1)
+
+        fold_time = time.time() - fold_start
+        gpu_mem = get_gpu_mem_by_pid(os.getpid(), handle) if (use_gpu and handle is not None) else 0.0
+        cpu_mem = process.memory_info().rss / 1024**2
+
+        print(
+            f"ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, "
+            f"F1={f1:.4f}, Time={fold_time:.2f}s, "
+            f"GPU={gpu_mem:.2f}MB, CPU={cpu_mem:.2f}MB"
+        )
+
+        df = pd.DataFrame({
+            "Y_test": le.inverse_transform(y_test),
+            "Y_pred": le.inverse_transform(y_pred)
+        })
+        df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+    print("\n===== CV Summary =====")
+    print(f"ACC : {np.mean(all_acc):.4f} ± {np.std(all_acc):.4f}")
+    print(f"PREC: {np.mean(all_prec):.4f} ± {np.std(all_prec):.4f}")
+    print(f"REC : {np.mean(all_rec):.4f} ± {np.std(all_rec):.4f}")
+    print(f"F1 : {np.mean(all_f1):.4f} ± {np.std(all_f1):.4f}")
+    print(f"Total Time: {time.time() - start_time:.2f}s")
+
+
+def XGBoost_class():
+    set_seed(42)
+    pynvml.nvmlInit()
+    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+
+    args = parse_args()
+
+    all_species = ["Human/Sim/"]
+    for species in all_species:
+        args.species = species
+        X, Y = load_data(args)
+        print(f"\n=== Running {args.methods}{args.species}{args.phe} ===")
+        label = Y[:, 0]
+
+        best_params = XGboost_GPU_he_class.Hyperparameter(X, label)
+        args.learning_rate = best_params['learning_rate']
+        args.n_estimators = best_params['n_estimators']
+        args.max_depth = best_params['max_depth']
+        args.subsample = best_params['subsample']
+        args.colsample_bytree = best_params['colsample_bytree']
+        args.gamma = best_params['gamma']
+        args.reg_alpha = best_params['reg_alpha']
+        args.reg_lambda = best_params['reg_lambda']
+        args.min_child_weight = best_params['min_child_weight']
+
+        start_time = time.time()
+        # Pass the NVML handle down so per-fold GPU memory can be sampled
+        run_cv(args, X, label, handle)
+
+        elapsed_time = time.time() - start_time
+        print(f"Total running time: {elapsed_time:.2f} s")
+        print("✔ Finished successfully")
+
+
+if __name__ == "__main__":
+    XGBoost_class()
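load_data() above implies a simple on-disk convention: one genotype.npz and one phenotype.npz per species directory, each holding a positional array under NumPy's default "arr_0" key. A hypothetical smoke-test setup satisfying that layout (the directory name and array shapes are assumptions inferred from the code, not a documented API):

import os
import numpy as np

data_dir = "../../data/Human/Sim/"   # matches the all_species entry above
os.makedirs(data_dir, exist_ok=True)

X = np.random.randint(0, 3, size=(120, 500)).astype(np.float32)  # samples x SNPs
Y = np.random.randint(0, 2, size=(120, 1))                       # label column read as Y[:, 0]

np.savez(os.path.join(data_dir, "genotype.npz"), X)   # positional arrays are stored as "arr_0"
np.savez(os.path.join(data_dir, "phenotype.npz"), Y)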
gpbench/method_class/XGBoost/XGboost_GPU_he_class.py
@@ -0,0 +1,178 @@
+import os
+import random
+import torch
+import numpy as np
+import argparse
+import time
+import optuna
+import xgboost as xgb
+
+from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import (
+    accuracy_score,
+    precision_recall_fscore_support
+)
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+def run_nested_cv_with_early_stopping(
+    data,
+    label,
+    outer_cv,
+    learning_rate,
+    n_estimators,
+    max_depth,
+    min_child_weight,
+    subsample,
+    colsample_bytree,
+    gamma,
+    reg_alpha,
+    reg_lambda,
+    use_gpu=True
+):
+    all_acc, all_prec, all_rec, all_f1 = [], [], [], []
+
+    # ===== Encode labels =====
+    le = LabelEncoder()
+    y_all = le.fit_transform(label)
+    num_classes = len(np.unique(y_all))
+
+    # ===== GPU / CPU =====
+    gpu_available = torch.cuda.is_available() and use_gpu
+    if gpu_available:
+        tree_method = "hist"
+        device = "cuda:0"
+        print("🚀 Using GPU-accelerated XGBoost")
+    else:
+        tree_method = "hist"
+        device = "cpu"
+        print("⚠ Using CPU XGBoost")
+
+    start_time = time.time()
+
+    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(data, y_all)):
+        X_train, X_test = data[train_idx], data[test_idx]
+        y_train, y_test = y_all[train_idx], y_all[test_idx]
+
+        if num_classes == 2:
+            objective = "binary:logistic"
+            eval_metric = "logloss"
+            num_class_param = None
+        else:
+            objective = "multi:softprob"
+            eval_metric = "mlogloss"
+            num_class_param = num_classes
+
+        model = xgb.XGBClassifier(
+            learning_rate=learning_rate,
+            n_estimators=n_estimators,
+            max_depth=max_depth,
+            min_child_weight=min_child_weight,
+            subsample=subsample,
+            colsample_bytree=colsample_bytree,
+            gamma=gamma,
+            reg_alpha=reg_alpha,
+            reg_lambda=reg_lambda,
+            objective=objective,
+            eval_metric=eval_metric,
+            num_class=num_class_param,
+            early_stopping_rounds=50,
+            random_state=42,
+            tree_method=tree_method,
+            device=device,
+            n_jobs=-1
+        )
+
+        # Early stopping monitors the held-out fold via eval_set
+        model.fit(
+            X_train,
+            y_train,
+            eval_set=[(X_test, y_test)],
+            verbose=False
+        )
+
+        # ===== Prediction =====
+        y_proba = model.predict_proba(X_test)
+        y_pred = np.argmax(y_proba, axis=1)
+
+        # ===== Metrics =====
+        acc = accuracy_score(y_test, y_pred)
+        prec, rec, f1, _ = precision_recall_fscore_support(
+            y_test,
+            y_pred,
+            average="macro",
+            zero_division=0
+        )
+
+        all_acc.append(acc)
+        all_prec.append(prec)
+        all_rec.append(rec)
+        all_f1.append(f1)
+
+        accel = "GPU" if gpu_available else "CPU"
+        print(
+            f"Fold {fold + 1}[{accel}]: "
+            f"ACC={acc:.4f}, "
+            f"PREC={prec:.4f}, "
+            f"REC={rec:.4f}, "
+            f"F1={f1:.4f}"
+        )
+
+    print("\n==== Final Results ====")
+    print(f"ACC : {np.mean(all_acc):.4f} ± {np.std(all_acc):.4f}")
+    print(f"PREC: {np.mean(all_prec):.4f} ± {np.std(all_prec):.4f}")
+    print(f"REC : {np.mean(all_rec):.4f} ± {np.std(all_rec):.4f}")
+    print(f"F1 : {np.mean(all_f1):.4f} ± {np.std(all_f1):.4f}")
+    print(f"Time: {time.time() - start_time:.2f}s")
+
+    return np.mean(all_f1)
+
+def Hyperparameter(data, label, use_gpu=True):
+    set_seed(42)
+
+    def objective(trial):
+        params = {
+            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.3),
+            "n_estimators": trial.suggest_int("n_estimators", 50, 800),
+            "max_depth": trial.suggest_int("max_depth", 3, 10),
+            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
+            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
+            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
+            "gamma": trial.suggest_float("gamma", 0, 10),
+            "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10, log=True),
+            "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10, log=True),
+        }
+
+        outer_cv = StratifiedKFold(
+            n_splits=10, shuffle=True, random_state=42
+        )
+
+        f1_mean = run_nested_cv_with_early_stopping(
+            data=data,
+            label=label,
+            outer_cv=outer_cv,
+            use_gpu=use_gpu,
+            **params
+        )
+        return f1_mean
+
+    study = optuna.create_study(direction="maximize")
+
+    study.set_user_attr("gpu_available", torch.cuda.is_available())
+    study.set_user_attr("using_gpu", use_gpu and torch.cuda.is_available())
+    study.set_user_attr("xgboost_version", xgb.__version__)
+
+    study.optimize(objective, n_trials=20)
+
+    print("\n===== Optuna Result =====")
+    print("Best F1:", study.best_value)
+    print("Best params:", study.best_params)
+
+    return study.best_params
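A hypothetical smoke-test call for the tuner above, with use_gpu=False so it also runs on machines without CUDA (the shapes and label distribution are made-up stand-ins):

import numpy as np
from gpbench.method_class.XGBoost import XGboost_GPU_he_class

X = np.random.rand(150, 400).astype(np.float32)  # samples x SNPs
y = np.random.randint(0, 2, size=150)            # binary phenotype labels

best = XGboost_GPU_he_class.Hyperparameter(X, y, use_gpu=False)
print(best)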
gpbench/method_class/XGBoost/__init__.py
@@ -0,0 +1,5 @@
+from .XGboost_GPU_class import XGBoost_class
+
+XGBoost = XGBoost_class
+
+__all__ = ["XGBoost", "XGBoost_class"]
gpbench/method_class/__init__.py
@@ -0,0 +1,52 @@
+
+#==============================================================
+# Usage example:
+#   import method_class
+#   print(method_class.METHODS)
+#   m = method_class.load_method("BayesA")
+#   print(m)  # method_class.BayesA
+
+# or:
+
+#   from method_class.BayesA import BayesA_class
+#   BayesA_class()
+#==============================================================
+from __future__ import annotations
+import importlib
+from typing import List
+
+METHODS: List[str] = [
+    "BayesA",
+    "BayesB",
+    "BayesC",
+    "LightGBM",
+    "CropARNet",
+    "Cropformer",
+    "DeepCCR",
+    "DeepGS",
+    "DNNGP",
+    "EIR",
+    "G2PDeep",
+    "GBLUP",
+    "GEFormer",
+    "RF",
+    "rrBLUP",
+    "SoyDNGP",
+    "SVC",
+    "XGBoost",
+    "ElasticNet",
+    "DL_GWAS"
+]
+
+__all__ = ["METHODS", "load_method"]
+
+def load_method(name: str):
+    """
+    Dynamically load a method subpackage and return its module object.
+    Usage:
+        m = method_class.load_method("BayesA")
+        # m is the method_class.BayesA package
+    """
+    if name not in METHODS:
+        raise ValueError(f"Unknown method '{name}'. Available: {METHODS}")
+    return importlib.import_module(f"{__name__}.{name}")
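The comments at the top of this module use the bare method_class import; assuming the wheel is installed and imported through its gpbench top level, the absolute form would be:

from gpbench import method_class

print(method_class.METHODS)                   # all registered method names
bayes_a = method_class.load_method("BayesA")  # imports gpbench.method_class.BayesA
# Note: loading a method imports its dependencies, which for the Bayes
# methods may include the R backend via rpy2.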
gpbench/method_class/rrBLUP/__init__.py
@@ -0,0 +1,5 @@
+from .rrBLUP_class import rrBLUP_class
+
+rrBLUP = rrBLUP_class
+
+__all__ = ["rrBLUP", "rrBLUP_class"]
gpbench/method_class/rrBLUP/rrBLUP_class.py
@@ -0,0 +1,140 @@
+import os
+import time
+import psutil
+import swanlab
+import argparse
+import random
+import torch
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
+from rpy2.robjects.packages import importr
+from rpy2.robjects import pandas2ri
+pandas2ri.activate()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Argument parser")
+    parser.add_argument('--methods', type=str, default='rrBLUP/', help='Method name')
+    parser.add_argument('--species', type=str, default='', help='Species name')
+    parser.add_argument('--data_dir', type=str, default='../../data/')
+    parser.add_argument('--result_dir', type=str, default='result/')
+    return parser.parse_args()
+
+
+def load_data(args):
+    xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+    phenotype = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))
+    yData = phenotype["arr_0"]
+    names = phenotype["arr_1"]
+    nsample = xData.shape[0]
+    nsnp = xData.shape[1]
+    print("Number of samples: ", nsample)
+    print("Number of SNPs: ", nsnp)
+    return xData, yData, nsample, nsnp, names
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def run_nested_cv(args, data, label):
+    result_dir = os.path.join(args.result_dir, args.methods + args.species)
+    os.makedirs(result_dir, exist_ok=True)
+    rrblup = importr('rrBLUP')
+    print("Successfully loaded rrBLUP")
+
+    print("Starting 10-fold cross-validation...")
+    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
+    le = LabelEncoder()
+    label_all = le.fit_transform(label)
+
+    np.save(os.path.join(result_dir, 'label_mapping.npy'), le.classes_)
+
+    all_acc, all_prec, all_rec, all_f1 = [], [], [], []
+    start_time = time.time()
+    process = psutil.Process(os.getpid())
+
+    for fold, (train_index, test_index) in enumerate(skf.split(data, label_all)):
+        fold_start = time.time()
+        print(f"\n===== Fold {fold} =====")
+        X_train, X_test = data[train_index], data[test_index]
+        Y_train, Y_test = label_all[train_index], label_all[test_index]
+
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()
+
+        # One-vs-rest: fit one rrBLUP mixed model per class and keep its score
+        classes = np.unique(Y_train)
+        scores = np.zeros((len(classes), X_test.shape[0]))
+        for idx, cls in enumerate(classes):
+            y_train_bin = (Y_train == cls).astype(float)
+            model = rrblup.mixed_solve(y=y_train_bin, Z=X_train)
+            beta = np.array(model.rx2('beta')).flatten()   # fixed-effect intercept
+            u = np.array(model.rx2('u')).flatten()         # marker effects
+            y_pred = np.dot(X_test, u) + beta
+            scores[idx, :] = y_pred
+
+        Y_pred = np.argmax(scores, axis=0)
+        acc = accuracy_score(Y_test, Y_pred)
+        prec, rec, f1, _ = precision_recall_fscore_support(Y_test, Y_pred, average='macro', zero_division=0)
+        cm = confusion_matrix(Y_test, Y_pred)
+
+        all_acc.append(acc)
+        all_prec.append(prec)
+        all_rec.append(rec)
+        all_f1.append(f1)
+
+        fold_time = time.time() - fold_start
+        fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
+        fold_cpu_mem = process.memory_info().rss / 1024**2
+        print(f'Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, '
+              f'Time={fold_time:.2f}s, GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+        Y_test_orig = le.inverse_transform(Y_test)
+        Y_pred_orig = le.inverse_transform(Y_pred)
+        results_df = pd.DataFrame({'Y_test': Y_test_orig, 'Y_pred': Y_pred_orig})
+        results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+    print("\n===== Cross-validation summary =====")
+    print(f"Average ACC: {np.mean(all_acc):.4f} ± {np.std(all_acc):.4f}")
+    print(f"Average PREC: {np.mean(all_prec):.4f} ± {np.std(all_prec):.4f}")
+    print(f"Average REC: {np.mean(all_rec):.4f} ± {np.std(all_rec):.4f}")
+    print(f"Average F1 : {np.mean(all_f1):.4f} ± {np.std(all_f1):.4f}")
+    print(f"Total time : {time.time() - start_time:.2f}s")
+
+
+def rrBLUP_class():
+    set_seed(42)
+    torch.cuda.empty_cache()
+    args = parse_args()
+
+    all_species = ["Human/Sim/"]
+
+    for species in all_species:
+        args.species = species
+        X, Y, nsamples, nsnp, names = load_data(args)
+        print("Starting run " + args.methods + args.species)
+        label = Y[:, 0]
+
+        # Replace missing labels with the mode (0 if every label is NaN)
+        s = pd.Series(label)
+        fill_val = s.mode().iloc[0] if not s.dropna().empty else 0
+        label = np.nan_to_num(label, nan=fill_val)
+
+        start_time = time.time()
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()
+        process = psutil.Process(os.getpid())
+        run_nested_cv(args, data=X, label=label)
+
+        elapsed_time = time.time() - start_time
+        print(f"Total running time: {elapsed_time:.2f} s")
+        print("Successfully finished!")
+
+
+if __name__ == "__main__":
+    rrBLUP_class()
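The fold loop above turns rrBLUP, a regression method, into a classifier: it fits one binary indicator per class with mixed_solve (y ≈ Zu + beta) and takes the argmax of the per-class scores. As a rough illustration of that one-vs-rest scoring scheme without the R dependency, here is a plain ridge-regression stand-in; the fixed penalty lam is an assumption, whereas mixed_solve estimates the shrinkage from variance components:

import numpy as np

def ridge_ovr_predict(X_train, y_train, X_test, lam=1.0):
    # One ridge fit per class, then argmax over per-class scores.
    # The p x p solve is only practical for modest SNP counts;
    # rrBLUP solves the equivalent mixed model more efficiently.
    classes = np.unique(y_train)
    scores = np.zeros((len(classes), X_test.shape[0]))
    XtX = X_train.T @ X_train + lam * np.eye(X_train.shape[1])
    for idx, cls in enumerate(classes):
        y_bin = (y_train == cls).astype(float)
        u = np.linalg.solve(XtX, X_train.T @ (y_bin - y_bin.mean()))
        scores[idx] = X_test @ u + y_bin.mean()  # the mean plays beta's role
    return classes[np.argmax(scores, axis=0)]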