gpbench-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
gpbench/method_reg/SoyDNGP/SoyDNGP.py
@@ -0,0 +1,179 @@
+ import os
+ import time
+ import psutil
+ import argparse
+ import random
+ import torch
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import KFold, train_test_split
+ from .AlexNet_206 import AlexNet
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from . import SoyDNGP_Hyperparameters
+ import pynvml
+
+ handle = None  # NVML device handle, initialised in SoyDNGP_reg()
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Argument parser")
+     parser.add_argument('--methods', type=str, default='SoyDNGP/', help='Method name')
+     parser.add_argument('--species', type=str, default='Chicken/', help='Species name')
+     parser.add_argument('--phe', type=str, default='', help='Dataset name')
+     parser.add_argument('--data_dir', type=str, default='../../data/')
+     parser.add_argument('--result_dir', type=str, default='result/')
+
+     parser.add_argument('--epochs', type=int, default=1000, help='Number of training rounds')
+     parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
+     parser.add_argument('--learning_rate', type=float, default=0.01, help='Learning rate')
+     parser.add_argument('--patience', type=int, default=10, help='Patience for early stopping')
+     args = parser.parse_args()
+     return args
+
+ def get_data(dataframe):
+     # One-hot encode the SNP matrix: each genotype value (0/1/2) maps to a 3-channel code.
+     data_matrix = np.array(dataframe)
+     total_sample, total_snp = data_matrix.shape
+     one_hot = np.zeros((total_sample, total_snp, 3), dtype=np.float32)
+     one_hot[data_matrix == 2] = [1, 1, 0]
+     one_hot[data_matrix == 1] = [1, 0, 1]
+     one_hot[data_matrix == 0] = [0, 1, 1]
+
+     # Pad or truncate the SNP axis to 206*206 so each sample reshapes into a square "image".
+     target_snp = 206 * 206
+     if total_snp != target_snp:
+         new_one_hot = np.zeros((total_sample, target_snp, 3), dtype=np.float32)
+         copy_len = min(total_snp, target_snp)
+         new_one_hot[:, :copy_len] = one_hot[:, :copy_len]
+         one_hot = new_one_hot
+
+     one_hot = one_hot.reshape(total_sample, 206, 206, 3)
+     return one_hot
+
+
+ def load_data(args):
+     xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+     yData = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_0"]
+     names = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_1"]
+
+     nsample = xData.shape[0]
+     nsnp = xData.shape[1]
+     print("Number of samples: ", nsample)
+     print("Number of SNPs: ", nsnp)
+     xData = get_data(xData)
+     return xData, yData, nsample, nsnp, names
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+ def get_gpu_mem_by_pid(pid):
+     # Return the GPU memory (MB) used by the given process; 0.0 if it holds no GPU memory.
+     procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+     for p in procs:
+         if p.pid == pid:
+             return p.usedGpuMemory / 1024**2
+     return 0.0
+
+
+ def run_nested_cv(args, data, label, nsnp, device):
+     result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+     os.makedirs(result_dir, exist_ok=True)
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+     time_start = time.time()
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         # Hold out 10% of the training fold as a validation set for early stopping.
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
+         y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
+         x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
+         y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
+         x_test_tensor = torch.from_numpy(X_test).float().to(device)
+         y_test_tensor = torch.from_numpy(y_test).float().to(device)
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, args.batch_size, shuffle=True)
+         valid_loader = DataLoader(valid_data, args.batch_size, shuffle=False)
+         test_loader = DataLoader(test_data, args.batch_size, shuffle=False)
+
+         model = AlexNet()
+         model.train_model(train_loader, valid_loader, args.epochs, args.learning_rate, args.patience, device)
+         y_pred = model.predict(test_loader)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = get_gpu_mem_by_pid(os.getpid())
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             torch.cuda.reset_peak_memory_stats()
+         results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
+         results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+     print("\n===== Cross-validation summary =====")
+     print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+     print(f"Time: {time.time() - time_start:.2f}s")
+
+
+ def SoyDNGP_reg():
+     global handle  # expose the NVML handle to get_gpu_mem_by_pid()
+     set_seed(42)
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     pynvml.nvmlInit()
+     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+
+     args = parse_args()
+     all_species = ['Cotton/']
+
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     args.device = device
+     for i in range(len(all_species)):
+         args.species = all_species[i]
+         X, Y, nsamples, nsnp, names = load_data(args)
+         for j in range(len(names)):
+             args.phe = names[j]
+             print("starting run " + args.methods + args.species + args.phe)
+             label = Y[:, j]
+             # Impute missing phenotype values with the column mean.
+             label = np.nan_to_num(label, nan=np.nanmean(label))
+             best_params = SoyDNGP_Hyperparameters.Hyperparameter(X, label, nsnp)
+             args.learning_rate = best_params['learning_rate']
+             args.batch_size = best_params['batch_size']
+             args.patience = best_params['patience']
+             start_time = time.time()
+             if torch.cuda.is_available():
+                 torch.cuda.reset_peak_memory_stats()
+             run_nested_cv(args, data=X, label=label, nsnp=nsnp, device=args.device)
+
+             elapsed_time = time.time() - start_time
+             print(f"running time: {elapsed_time:.2f} s")
+             print("Finished successfully")
+
+
+ if __name__ == "__main__":
+     SoyDNGP_reg()
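
For orientation, a minimal sketch of the encoding that get_data() applies: each genotype value in {0, 1, 2} maps to a fixed 3-channel code, and the SNP axis is zero-padded (or truncated) to 206*206 = 42436 positions so every sample reshapes into a 206x206x3 tensor for the AlexNet-style network. The toy matrix below is illustrative, not from the package data.

import numpy as np

# Toy 2-sample x 4-SNP genotype matrix with values in {0, 1, 2}.
geno = np.array([[0, 1, 2, 1],
                 [2, 0, 0, 1]])

one_hot = np.zeros((*geno.shape, 3), dtype=np.float32)
one_hot[geno == 2] = [1, 1, 0]
one_hot[geno == 1] = [1, 0, 1]
one_hot[geno == 0] = [0, 1, 1]
print(one_hot[0, 2])  # genotype 2 -> [1. 1. 0.]

# Zero-pad the SNP axis to 206*206 positions, then reshape each sample
# into a 206x206x3 "image", as get_data() does.
padded = np.zeros((geno.shape[0], 206 * 206, 3), dtype=np.float32)
padded[:, :geno.shape[1]] = one_hot
images = padded.reshape(-1, 206, 206, 3)
print(images.shape)  # (2, 206, 206, 3)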
gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py
@@ -0,0 +1,105 @@
+ import os
+ import time
+ import psutil
+ import random
+ import torch
+ import numpy as np
+ import optuna
+ from sklearn.model_selection import KFold, train_test_split
+ from .AlexNet_206 import AlexNet
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from optuna.exceptions import TrialPruned
+
+ def run_nested_cv_with_early_stopping(data, label, nsnp, learning_rate, patience, batch_size, num_round=300):
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
+         y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
+         x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
+         y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
+         x_test_tensor = torch.from_numpy(X_test).float().to(device)
+         y_test_tensor = torch.from_numpy(y_test).float().to(device)
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, batch_size, shuffle=True)
+         valid_loader = DataLoader(valid_data, batch_size, shuffle=False)
+         test_loader = DataLoader(test_data, batch_size, shuffle=False)
+
+         model = AlexNet()
+         model.train_model(train_loader, valid_loader, num_round, learning_rate, patience, device)
+         y_pred = model.predict(test_loader)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         # A NaN correlation (e.g. constant predictions) makes the trial worthless; prune it.
+         if np.isnan(pcc):
+             print(f"Fold {fold} resulted in NaN PCC, pruning the trial...")
+             raise TrialPruned()
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'CPU={fold_cpu_mem:.2f}MB')
+
+     return np.mean(all_pcc) if all_pcc else 0.0
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+ def Hyperparameter(data, label, nsnp):
+     set_seed(42)
+     def objective(trial):
+         # suggest_loguniform is deprecated; use suggest_float(..., log=True) instead.
+         learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
+         batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
+         patience = trial.suggest_int("patience", 1, 10)
+         try:
+             corr_score = run_nested_cv_with_early_stopping(
+                 data=data,
+                 label=label,
+                 nsnp=nsnp,
+                 learning_rate=learning_rate,
+                 patience=patience,
+                 batch_size=batch_size
+             )
+         except TrialPruned:
+             # Report a worst-case score so the study keeps a record of the failed trial.
+             return float("-inf")
+         return corr_score
+
+     study = optuna.create_study(direction="maximize")
+     study.optimize(objective, n_trials=20)
+
+     print("best params:", study.best_params)
+     print("Finished successfully")
+     return study.best_params
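
A design note on the pruning above: the objective catches TrialPruned and reports float("-inf"), so a NaN fold still produces a COMPLETE trial with a worst-case score. The more common Optuna idiom is to let the exception propagate so the trial is recorded as PRUNED and excluded from best_params. A minimal sketch, with a dummy objective standing in for the real CV loop:

import math
import optuna
from optuna.exceptions import TrialPruned

def objective(trial):
    x = trial.suggest_float("x", -10, 10)
    score = -(x - 2) ** 2        # stand-in for the mean fold PCC
    if math.isnan(score):        # NaN guard, mirroring the fold loop above
        raise TrialPruned()      # recorded as PRUNED rather than COMPLETE
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print(study.best_params)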
gpbench/method_reg/SoyDNGP/__init__.py
@@ -0,0 +1,5 @@
+ from .SoyDNGP import SoyDNGP_reg
+
+ SoyDNGP = SoyDNGP_reg
+
+ __all__ = ["SoyDNGP", "SoyDNGP_reg"]
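
Given the re-exports above, the SoyDNGP regression pipeline should be importable under either name once gpbench is installed; a usage sketch (the function parses its own CLI arguments and expects the genotype/phenotype .npz files under --data_dir):

from gpbench.method_reg.SoyDNGP import SoyDNGP, SoyDNGP_reg

assert SoyDNGP is SoyDNGP_reg  # alias and original are the same callable
SoyDNGP_reg()                  # runs hyperparameter search + 10-fold CV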
gpbench/method_reg/XGBoost/XGboost_GPU.py
@@ -0,0 +1,188 @@
+ import os
+ import time
+ import torch
+ import psutil
+ import argparse
+ import random
+ import xgboost as xgb
+ import numpy as np
+ import pandas as pd
+ import pynvml
+ from sklearn.model_selection import KFold
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from scipy.stats import pearsonr
+ from . import XGboost_Hyperparameters
+
+ handle = None  # NVML device handle, initialised in XGBoost_reg()
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Argument parser for XGBoost")
+     parser.add_argument('--methods', type=str, default='XGBoost/', help='Method name')
+     parser.add_argument('--species', type=str, default='Chickpea/GSTP012/', help='Species name')
+     parser.add_argument('--phe', type=str, default='', help='Dataset name')
+     parser.add_argument('--data_dir', type=str, default='../../data/')
+     parser.add_argument('--result_dir', type=str, default='result/')
+
+     parser.add_argument('--learning_rate', type=float, default=0.1)
+     parser.add_argument('--n_estimators', type=int, default=100)
+     parser.add_argument('--max_depth', type=int, default=6)
+     parser.add_argument('--min_child_weight', type=int, default=1)
+     parser.add_argument('--subsample', type=float, default=0.8)
+     parser.add_argument('--colsample_bytree', type=float, default=0.8)
+     parser.add_argument('--gamma', type=float, default=0)
+     parser.add_argument('--reg_alpha', type=float, default=0)
+     parser.add_argument('--reg_lambda', type=float, default=1)
+     # type=bool would treat any non-empty string as True; BooleanOptionalAction parses correctly.
+     parser.add_argument('--use_gpu', action=argparse.BooleanOptionalAction, default=True,
+                         help='Whether to use GPU acceleration')
+     args = parser.parse_args()
+     return args
+
+ def load_data(args):
+     xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+     yData = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_0"]
+     names = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_1"]
+
+     nsample = xData.shape[0]
+     nsnp = xData.shape[1]
+     print("Number of samples: ", nsample)
+     print("Number of SNPs: ", nsnp)
+     return xData, yData, nsample, nsnp, names
+
+ def get_gpu_mem_by_pid(pid):
+     # Return the GPU memory (MB) used by the given process; 0.0 if it holds no GPU memory.
+     procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+     for p in procs:
+         if p.pid == pid:
+             return p.usedGpuMemory / 1024**2
+     return 0.0
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+ def run_nested_cv(args, data, label):
+     result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+     os.makedirs(result_dir, exist_ok=True)
+     print("Starting 10-fold cross-validation with XGBoost...")
+     use_gpu = args.use_gpu and torch.cuda.is_available()
+     if use_gpu:
+         print("🚀 GPU XGBoost (XGBoost 2.0+ API)")
+         tree_method = 'hist'
+         device = 'cuda:0'
+     else:
+         print("⚠ CPU")
+         tree_method = 'hist'
+         device = 'cpu'
+
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+     time_start = time.time()
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         fold_start_time = time.time()
+         process = psutil.Process(os.getpid())
+
+         x_train, x_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         model = xgb.XGBRegressor(
+             learning_rate=args.learning_rate,
+             n_estimators=args.n_estimators,
+             max_depth=args.max_depth,
+             min_child_weight=args.min_child_weight,
+             subsample=args.subsample,
+             colsample_bytree=args.colsample_bytree,
+             gamma=args.gamma,
+             reg_alpha=args.reg_alpha,
+             reg_lambda=args.reg_lambda,
+             objective='reg:squarederror',
+             eval_metric='rmse',
+             random_state=42,
+             # GPU selection via the XGBoost 2.0+ device parameter
+             tree_method=tree_method,
+             device=device,
+             n_jobs=-1,
+         )
+
+         model.fit(x_train, y_train)
+
+         y_test_preds = model.predict(x_test)
+         y_pred = y_test_preds.reshape(-1)
+         y_test_original = y_test.reshape(-1)
+
+         mse = mean_squared_error(y_test_original, y_pred)
+         r2 = r2_score(y_test_original, y_pred)
+         mae = mean_absolute_error(y_test_original, y_pred)
+         pcc, _ = pearsonr(y_test_original, y_pred)
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = get_gpu_mem_by_pid(os.getpid()) if use_gpu else 0.0
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+
+         acceleration_status = "GPU" if use_gpu else "CPU"
+         print(f'Fold {fold}[{acceleration_status}]: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+         results_df = pd.DataFrame({'Y_test': y_test_original, 'Y_pred': y_pred})
+         results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+     print("\n===== Cross-validation summary =====")
+     print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+     print(f"Time: {time.time() - time_start:.2f}s")
+
+
+ def XGBoost_reg():
+     global handle  # expose the NVML handle to get_gpu_mem_by_pid()
+     set_seed(42)
+     pynvml.nvmlInit()
+     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+     args = parse_args()
+     all_species = ['Cotton/']
+
+     for i in range(len(all_species)):
+         args.species = all_species[i]
+         os.makedirs(os.path.join(args.result_dir, args.methods + args.species + args.phe), exist_ok=True)
+         X, Y, nsamples, nsnp, names = load_data(args)
+
+         for j in range(len(names)):
+             args.phe = names[j]
+             print("starting run " + args.methods + args.species + args.phe)
+             label = Y[:, j]
+             # Impute missing phenotype values with the column mean.
+             label = np.nan_to_num(label, nan=np.nanmean(label))
+
+             best_params = XGboost_Hyperparameters.Hyperparameter(X, label)
+             args.learning_rate = best_params['learning_rate']
+             args.n_estimators = best_params['n_estimators']
+             args.max_depth = best_params['max_depth']
+             args.min_child_weight = best_params['min_child_weight']
+             args.subsample = best_params['subsample']
+             args.colsample_bytree = best_params['colsample_bytree']
+             args.gamma = best_params['gamma']
+             args.reg_alpha = best_params['reg_alpha']
+             args.reg_lambda = best_params['reg_lambda']
+
+             start_time = time.time()
+             if torch.cuda.is_available():
+                 torch.cuda.reset_peak_memory_stats()
+
+             run_nested_cv(args, data=X, label=label)
+
+             elapsed_time = time.time() - start_time
+             print(f"running time: {elapsed_time:.2f} s")
+             print("Finished successfully")
+
+
+ if __name__ == "__main__":
+     XGBoost_reg()
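
The script above uses the XGBoost 2.0+ convention for GPU training: tree_method='hist' combined with device='cuda:0', replacing the older tree_method='gpu_hist'. A minimal standalone sketch on synthetic data, with device='cpu' so it runs anywhere:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 50))
y = 0.5 * X[:, 0] + rng.normal(scale=0.1, size=200)

model = xgb.XGBRegressor(
    n_estimators=50,
    tree_method="hist",
    device="cpu",   # switch to "cuda:0" on a GPU machine (XGBoost >= 2.0)
    random_state=42,
)
model.fit(X, y)
print(model.predict(X[:5]))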
gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py
@@ -0,0 +1,167 @@
+ import random
+ import time
+ import torch
+ import numpy as np
+ from sklearn.model_selection import KFold
+ import xgboost as xgb
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+ import optuna
+
+ def run_nested_cv_with_early_stopping(data, label, outer_cv, learning_rate, n_estimators, max_depth, min_child_weight,
+                                       subsample, colsample_bytree, gamma, reg_alpha, reg_lambda, use_gpu=True):
+     best_corr_coefs = []
+     best_maes = []
+     best_r2s = []
+     best_mses = []
+
+     # Check GPU availability
+     gpu_available = torch.cuda.is_available() and use_gpu
+
+     # Use the XGBoost 2.0+ API
+     if gpu_available:
+         print("🚀 Using GPU-accelerated XGBoost (XGBoost 2.0+ API)")
+         tree_method = 'hist'   # hist algorithm
+         device = 'cuda:0'      # new device parameter
+     else:
+         print("⚠ Using CPU version")
+         tree_method = 'hist'
+         device = 'cpu'
+
+     time_start = time.time()
+
+     for fold, (train_idx, test_idx) in enumerate(outer_cv.split(data)):
+         x_train, x_test = data[train_idx], data[test_idx]
+         y_train, y_test = label[train_idx], label[test_idx]
+
+         # Optional x/y standardisation (StandardScaler) and CuPy device transfer
+         # were disabled in the released version.
+
+         # Initialise the XGBoost model with the new GPU parameters (XGBoost 2.0+)
+         model = xgb.XGBRegressor(
+             learning_rate=learning_rate,
+             n_estimators=n_estimators,
+             max_depth=max_depth,
+             min_child_weight=min_child_weight,
+             subsample=subsample,
+             colsample_bytree=colsample_bytree,
+             gamma=gamma,
+             reg_alpha=reg_alpha,
+             reg_lambda=reg_lambda,
+             objective='reg:squarederror',
+             early_stopping_rounds=50,
+             eval_metric='rmse',
+             random_state=42,
+             # ==== new GPU acceleration parameters (XGBoost 2.0+) ====
+             tree_method=tree_method,  # hist algorithm
+             device=device,            # 'cuda' or 'cpu'
+             n_jobs=-1,                # use all CPU cores
+         )
+
+         # Train the model; note that early stopping is monitored on the outer test fold.
+         model.fit(x_train,
+                   y_train,
+                   eval_set=[(x_test, y_test)],
+                   verbose=False)
+
+         # Predict
+         y_test_preds = model.predict(x_test).reshape(-1)
+         y_test_trues = y_test.reshape(-1)
+         # Compute evaluation metrics
+         corr_coef = np.corrcoef(y_test_preds, y_test_trues)[0, 1]
+         mae = mean_absolute_error(y_test_trues, y_test_preds)
+         mse = mean_squared_error(y_test_trues, y_test_preds)
+         r2 = r2_score(y_test_trues, y_test_preds)
+
+         best_corr_coefs.append(corr_coef)
+         best_maes.append(mae)
+         best_r2s.append(r2)
+         best_mses.append(mse)
+
+         acceleration_status = "GPU" if gpu_available else "CPU"
+         print(f'Fold {fold + 1}[{acceleration_status}]: MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Corr={corr_coef:.4f}')
+
+     print("==== Final Results ====")
+     print(f"Acceleration: {'GPU' if gpu_available else 'CPU'}")
+     print(f"MAE: {np.mean(best_maes):.4f} ± {np.std(best_maes):.4f}")
+     print(f"MSE: {np.mean(best_mses):.4f} ± {np.std(best_mses):.4f}")
+     print(f"R2 : {np.mean(best_r2s):.4f} ± {np.std(best_r2s):.4f}")
+     print(f"Corr: {np.mean(best_corr_coefs):.4f} ± {np.std(best_corr_coefs):.4f}")
+
+     print(f"Time: {time.time() - time_start:.2f}s")
+     return np.mean(best_corr_coefs)
+
+ # Set the random seed
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+ def Hyperparameter(data, label, use_gpu=True):
+     set_seed(42)
+
+     def objective(trial):
+         learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.2)
+         n_estimators = trial.suggest_int("n_estimators", 50, 1000)
+         max_depth = trial.suggest_int("max_depth", 3, 10)
+         min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
+         subsample = trial.suggest_float("subsample", 0.05, 1.0)
+         colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
+         gamma = trial.suggest_float("gamma", 0, 10)
+         reg_alpha = trial.suggest_float("reg_alpha", 1e-3, 10, log=True)
+         reg_lambda = trial.suggest_float("reg_lambda", 1e-3, 10, log=True)
+
+         outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
+
+         corr_score = run_nested_cv_with_early_stopping(
+             data=data,
+             label=label,
+             outer_cv=outer_cv,
+             learning_rate=learning_rate,
+             n_estimators=n_estimators,
+             max_depth=max_depth,
+             min_child_weight=min_child_weight,
+             subsample=subsample,
+             colsample_bytree=colsample_bytree,
+             gamma=gamma,
+             reg_alpha=reg_alpha,
+             reg_lambda=reg_lambda,
+             use_gpu=use_gpu
+         )
+         return corr_score
+
+     # Run the Optuna hyperparameter optimisation
+     study = optuna.create_study(direction="maximize")
+
+     # Record GPU information on the study
+     study.set_user_attr('gpu_available', torch.cuda.is_available())
+     study.set_user_attr('using_gpu', use_gpu and torch.cuda.is_available())
+     study.set_user_attr('xgboost_version', xgb.__version__)
+
+     study.optimize(objective, n_trials=20)
+
+     print("best params:", study.best_params)
+     print(f"Optimisation finished - using {'GPU' if (use_gpu and torch.cuda.is_available()) else 'CPU'}")
+     return study.best_params
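
One caveat in the search above: model.fit monitors early stopping on the outer test fold (eval_set=[(x_test, y_test)]), so the test data influences the chosen number of boosting rounds. A leak-free variant holds a validation split out of the training fold instead; a sketch on synthetic data (split sizes illustrative):

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 20))
y = X[:, 0] + rng.normal(scale=0.1, size=300)

# Carve the early-stopping validation set out of the training data so the
# held-out test fold never influences the number of boosting rounds.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

model = xgb.XGBRegressor(
    n_estimators=500,
    early_stopping_rounds=50,
    eval_metric="rmse",
    tree_method="hist",
    device="cpu",
    random_state=42,
)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print("best iteration:", model.best_iteration)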
gpbench/method_reg/XGBoost/__init__.py
@@ -0,0 +1,5 @@
+ from .XGboost_GPU import XGBoost_reg
+
+ XGBoost = XGBoost_reg
+
+ __all__ = ["XGBoost", "XGBoost_reg"]
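
As with the SoyDNGP package above, the alias makes the XGBoost entry point importable under either name (assuming gpbench is installed):

from gpbench.method_reg.XGBoost import XGBoost, XGBoost_reg

assert XGBoost is XGBoost_reg  # same callable under both names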