gpbench-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
gpbench/method_reg/MVP/MVP.py
@@ -0,0 +1,182 @@
+ import os
+ import time
+ import psutil
+ import argparse
+ import random
+ import torch
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import KFold, train_test_split
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from .base_MVP import MVP
+ from . import MVP_Hyperparameters
+ import pynvml
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="MVP regression")
+     parser.add_argument('--methods', type=str, default='MVP/')
+     parser.add_argument('--species', type=str, default='Wheat/')
+     parser.add_argument('--phe', type=str, default='')
+     parser.add_argument('--data_dir', type=str, default='../../data/')
+     parser.add_argument('--result_dir', type=str, default='result/')
+
+     parser.add_argument('--epochs', type=int, default=100)
+     parser.add_argument('--batch_size', type=int, default=64)
+     parser.add_argument('--learning_rate', type=float, default=0.01)
+     parser.add_argument('--patience', type=int, default=10)
+     parser.add_argument('--nb_filters', type=int, default=32)
+     return parser.parse_args()
+
+
+ def load_data(args):
+     # genotype.npz: arr_0 = (n_samples, n_snps); phenotype.npz: arr_0 = (n_samples, n_traits), arr_1 = trait names
+     xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+     yData = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_0"]
+     names = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_1"]
+
+     nsample = xData.shape[0]
+     nsnp = xData.shape[1]
+     print("Number of samples: ", nsample)
+     print("Number of SNPs: ", nsnp)
+     return xData, yData, nsample, nsnp, names
+
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+
+ def get_gpu_mem_by_pid(pid, handle=None):
+     if handle is None:
+         return 0.0
+     try:
+         procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+         for p in procs:
+             if p.pid == pid:
+                 return p.usedGpuMemory / 1024**2
+         return 0.0
+     except Exception:
+         return 0.0
+
+
+ def run_nested_cv(args, data, label, n_features, device, gpu_handle=None):
+     result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+     os.makedirs(result_dir, exist_ok=True)
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+     time_start = time.time()
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         # Hold out 10% of the training fold as a validation set for early stopping.
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
+             X_train, y_train, test_size=0.1, random_state=42
+         )
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float().unsqueeze(2).unsqueeze(3)  # (N, n_features, 1, 1)
+         y_train_tensor = torch.from_numpy(y_train_sub).float().unsqueeze(1)
+
+         x_valid_tensor = torch.from_numpy(X_valid).float().unsqueeze(2).unsqueeze(3)
+         y_valid_tensor = torch.from_numpy(y_valid).float().unsqueeze(1)
+
+         x_test_tensor = torch.from_numpy(X_test).float().unsqueeze(2).unsqueeze(3)
+         y_test_tensor = torch.from_numpy(y_test).float()
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, args.batch_size, shuffle=True,
+                                   num_workers=4, pin_memory=torch.cuda.is_available())
+         valid_loader = DataLoader(valid_data, args.batch_size, shuffle=False,
+                                   num_workers=4, pin_memory=torch.cuda.is_available())
+         test_loader = DataLoader(test_data, args.batch_size, shuffle=False,
+                                  num_workers=4, pin_memory=torch.cuda.is_available())
+
+         model = MVP(input_size=n_features, nb_filters=args.nb_filters)
+         model.train_model(train_loader, valid_loader, args.epochs, args.learning_rate, args.patience, device)
+         y_pred = model.predict(test_loader)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = get_gpu_mem_by_pid(os.getpid(), gpu_handle)
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, '
+               f'Time={fold_time:.2f}s, GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             torch.cuda.reset_peak_memory_stats()
+
+         results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
+         results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+     print("\n===== Cross-validation summary =====")
+     print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+     print(f"Time: {time.time() - time_start:.2f}s")
+
+
+ def MVP_reg():
+     set_seed(42)
+     gpu_handle = None
+     try:
+         if torch.cuda.is_available():
+             pynvml.nvmlInit()
+             gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+     except Exception as e:
+         print(f"Warning: GPU monitoring initialization failed: {e}")
+         gpu_handle = None
+
+     args = parse_args()
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     all_species = ['Cotton/']
+
+     for species in all_species:
+         args.species = species
+         X, Y, nsamples, nsnp, names = load_data(args)
+         for j in range(len(names)):
+             args.phe = names[j]
+             print("starting run " + args.methods + args.species + args.phe)
+             # Impute missing phenotype values with the trait mean.
+             label = Y[:, j]
+             label = np.nan_to_num(label, nan=np.nanmean(label))
+             best_params = MVP_Hyperparameters.Hyperparameter(X, label, nsnp)
+             args.learning_rate = best_params['learning_rate']
+             args.batch_size = best_params['batch_size']
+             args.patience = best_params['patience']
+             args.nb_filters = best_params['nb_filters']
+             start_time = time.time()
+             if torch.cuda.is_available():
+                 torch.cuda.reset_peak_memory_stats()
+             run_nested_cv(args, data=X, label=label, n_features=nsnp, device=device, gpu_handle=gpu_handle)
+             elapsed_time = time.time() - start_time
+             print(f"running time: {elapsed_time:.2f} s")
+             print("finished successfully")
+
+
+ if __name__ == "__main__":
+     MVP_reg()
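
Note: load_data() above pins the expected on-disk layout: each species directory under --data_dir holds a genotype.npz whose arr_0 is an (n_samples, n_snps) matrix, and a phenotype.npz whose arr_0 is an (n_samples, n_traits) matrix with the trait names in arr_1. A minimal sketch that fabricates such a dataset for a smoke test (the directory and trait names here are illustrative, not shipped with the package):

    import os
    import numpy as np

    data_dir = "../../data/Cotton/"  # hypothetical species directory
    os.makedirs(data_dir, exist_ok=True)

    rng = np.random.default_rng(0)
    genotype = rng.integers(0, 3, size=(200, 1000)).astype(np.float32)  # (samples, SNPs)
    phenotype = rng.normal(size=(200, 2)).astype(np.float32)            # (samples, traits)
    trait_names = np.array(["trait_a", "trait_b"])                      # hypothetical trait names

    # np.savez stores positional arrays as arr_0, arr_1, ... matching load_data()'s keys
    np.savez(os.path.join(data_dir, "genotype.npz"), genotype)
    np.savez(os.path.join(data_dir, "phenotype.npz"), phenotype, trait_names)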
gpbench/method_reg/MVP/MVP_Hyperparameters.py
@@ -0,0 +1,126 @@
+ import os
+ import time
+ import psutil
+ import random
+ import torch
+ import numpy as np
+ import optuna
+ from sklearn.model_selection import KFold, train_test_split
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from optuna.exceptions import TrialPruned
+ from .base_MVP import MVP
+
+
+ def run_nested_cv_with_early_stopping(data, label, n_features, learning_rate, patience, batch_size, nb_filters, num_round=100):
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
+             X_train, y_train, test_size=0.1, random_state=42
+         )
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float().unsqueeze(2).unsqueeze(3)  # (N, n_features, 1, 1)
+         y_train_tensor = torch.from_numpy(y_train_sub).float().unsqueeze(1)  # (N, 1)
+
+         x_valid_tensor = torch.from_numpy(X_valid).float().unsqueeze(2).unsqueeze(3)
+         y_valid_tensor = torch.from_numpy(y_valid).float().unsqueeze(1)  # (N, 1)
+
+         x_test_tensor = torch.from_numpy(X_test).float().unsqueeze(2).unsqueeze(3)
+         y_test_tensor = torch.from_numpy(y_test).float()
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, batch_size, shuffle=True,
+                                   num_workers=4, pin_memory=torch.cuda.is_available())
+         valid_loader = DataLoader(valid_data, batch_size, shuffle=False,
+                                   num_workers=4, pin_memory=torch.cuda.is_available())
+         test_loader = DataLoader(test_data, batch_size, shuffle=False,
+                                  num_workers=4, pin_memory=torch.cuda.is_available())
+
+         model = MVP(input_size=n_features, nb_filters=nb_filters)
+         model.train_model(train_loader, valid_loader, num_round, learning_rate, patience, device)
+         y_pred = model.predict(test_loader)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         # A degenerate (e.g. constant) prediction yields a NaN correlation; prune the trial.
+         if np.isnan(pcc) or np.isinf(pcc):
+             print(f"Fold {fold} resulted in NaN/Inf PCC, pruning the trial...")
+             raise TrialPruned()
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, '
+               f'Time={fold_time:.2f}s, CPU={fold_cpu_mem:.2f}MB')
+
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         del model
+
+     print("\n===== CV Summary =====")
+     print(f"PCC : {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"MAE : {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"MSE : {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"R2  : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+
+     return float(np.mean(all_pcc)) if all_pcc else 0.0
+
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+
+ def Hyperparameter(data, label, n_features):
+     set_seed(42)
+
+     def objective(trial):
+         learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
+         batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
+         patience = trial.suggest_int("patience", 1, 10)
+         nb_filters = trial.suggest_categorical("nb_filters", [8, 16, 32])
+         try:
+             corr_score = run_nested_cv_with_early_stopping(
+                 data=data,
+                 label=label,
+                 n_features=n_features,
+                 learning_rate=learning_rate,
+                 patience=patience,
+                 batch_size=batch_size,
+                 nb_filters=nb_filters
+             )
+         except TrialPruned:
+             # Score pruned trials as -inf so they can never become the best trial.
+             return float("-inf")
+         return corr_score
+
+     study = optuna.create_study(direction="maximize")
+     study.optimize(objective, n_trials=20)
+
+     print("best params:", study.best_params)
+     print("finished successfully")
+     return study.best_params
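
Note: Hyperparameter() runs a complete 10-fold cross-validation per Optuna trial and maximizes the mean test-fold PCC over 20 trials, so a single call trains up to 200 models. A minimal sketch of invoking it directly on in-memory arrays (array sizes are illustrative, and the import assumes the package's dependencies such as optuna, psutil and pynvml are installed):

    import numpy as np
    from gpbench.method_reg.MVP import MVP_Hyperparameters

    rng = np.random.default_rng(0)
    X = rng.integers(0, 3, size=(100, 500)).astype(np.float32)  # toy genotype matrix
    y = rng.normal(size=100).astype(np.float32)                 # toy phenotype vector

    best = MVP_Hyperparameters.Hyperparameter(X, y, n_features=X.shape[1])
    # best is a dict with learning_rate, batch_size, patience and nb_filters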
gpbench/method_reg/MVP/__init__.py
@@ -0,0 +1,5 @@
+ from .MVP import MVP_reg
+
+ MVP = MVP_reg
+
+ __all__ = ["MVP", "MVP_reg"]
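
Note: the MVP = MVP_reg alias re-exports the runner under the method's name, shadowing the MVP submodule, so importing MVP from this package yields the entry function rather than the module:

    from gpbench.method_reg.MVP import MVP  # the MVP_reg function, not MVP.py
    MVP()  # parses CLI arguments and runs the full pipeline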
gpbench/method_reg/MVP/base_MVP.py
@@ -0,0 +1,113 @@
+ import copy
+
+ import torch
+ import torch.nn as nn
+ import numpy as np
+
+
+ class MVP(nn.Module):
+     def __init__(self, input_size, nb_filters=32):
+         super().__init__()
+         self.input_size = input_size
+         self.nb_filters = nb_filters
+         self.kernel_size = (3, 1)
+
+         self.conv2d1 = nn.Conv2d(in_channels=1, out_channels=nb_filters,
+                                  kernel_size=self.kernel_size, padding='same')
+         self.conv2d2 = nn.Conv2d(in_channels=nb_filters, out_channels=nb_filters,
+                                  kernel_size=self.kernel_size, padding='same')
+         self.relu = nn.ReLU()
+
+         flattened_dim = nb_filters * input_size * 1
+         self.fc1 = nn.Linear(flattened_dim, 512)
+         self.fc2 = nn.Linear(512, 1)
+
+     def forward(self, x):
+         # Normalize the input to (N, 1, n_features, 1) regardless of how the caller batched it.
+         if x.dim() == 4 and x.size(1) != 1:
+             x = x.view(x.size(0), 1, x.size(1), x.size(2))
+         elif x.dim() == 2:
+             x = x.unsqueeze(1).unsqueeze(3)
+         elif x.dim() == 5:
+             x = x.squeeze(-1)
+
+         # Two residual blocks; in the first, the 1-channel input broadcasts
+         # against the nb_filters-channel convolution output.
+         for i in range(2):
+             x_res = x
+             if i == 0:
+                 x = self.conv2d1(x)
+             else:
+                 x = self.conv2d2(x)
+             x = self.relu(x)
+             x = self.conv2d2(x)
+             x = x + x_res
+             x = self.relu(x)
+
+         x = x.view(x.shape[0], -1)
+         x = self.fc1(x)
+         x = self.relu(x)
+         x = self.fc2(x)
+
+         return x
+
+     def train_model(self, train_loader, valid_loader, num_epochs, learning_rate, patience, device):
+         optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate, weight_decay=1e-5)
+         criterion = nn.MSELoss()
+         self.to(device)
+
+         best_loss = float('inf')
+         best_state = None
+         trigger_times = 0
+
+         for epoch in range(num_epochs):
+             self.train()
+             train_loss = 0.0
+             for inputs, labels in train_loader:
+                 inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True).float()
+                 if labels.dim() == 1:
+                     labels = labels.unsqueeze(1)
+
+                 optimizer.zero_grad()
+                 outputs = self(inputs)
+                 loss = criterion(outputs, labels)
+                 loss.backward()
+                 optimizer.step()
+                 train_loss += loss.item() * inputs.size(0)
+
+             self.eval()
+             valid_loss = 0.0
+             with torch.no_grad():
+                 for inputs, labels in valid_loader:
+                     inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True).float()
+                     if labels.dim() == 1:
+                         labels = labels.unsqueeze(1)
+
+                     outputs = self(inputs)
+                     loss = criterion(outputs, labels)
+                     valid_loss += loss.item() * inputs.size(0)
+
+             train_loss /= len(train_loader.dataset)
+             valid_loss /= len(valid_loader.dataset)
+
+             if valid_loss < best_loss:
+                 best_loss = valid_loss
+                 # Deep-copy the weights: state_dict() returns live references
+                 # that later optimizer steps would overwrite.
+                 best_state = copy.deepcopy(self.state_dict())
+                 trigger_times = 0
+             else:
+                 trigger_times += 1
+                 if trigger_times >= patience:
+                     print(f"Early stopping at epoch {epoch+1}")
+                     break
+
+         if best_state is not None:
+             self.load_state_dict(best_state)
+         return best_loss
+
+     def predict(self, test_loader):
+         self.eval()
+         device = next(self.parameters()).device
+         y_pred = []
+         with torch.no_grad():
+             for inputs, _ in test_loader:
+                 inputs = inputs.to(device, non_blocking=True)
+                 outputs = self(inputs)
+                 y_pred.append(outputs.cpu().numpy())
+         y_pred = np.concatenate(y_pred, axis=0)
+         y_pred = np.squeeze(y_pred)
+         return y_pred
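
Note: because the convolutions preserve the (n_features, 1) spatial extent, the flatten feeds fc1 with nb_filters * input_size values, so the first linear layer dominates the parameter count on large SNP panels. A quick shape check on random input (sizes are illustrative):

    import torch
    from gpbench.method_reg.MVP.base_MVP import MVP

    model = MVP(input_size=500, nb_filters=8)
    x = torch.randn(4, 500, 1, 1)  # (N, n_features, 1, 1), as the runners build it
    print(model(x).shape)          # torch.Size([4, 1])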
gpbench/method_reg/RF/RF_GPU.py
@@ -0,0 +1,174 @@
+ import os
+ import time
+ import psutil
+ import pynvml
+ import argparse
+ import random
+ import torch
+ import pandas as pd
+ import numpy as np
+ from sklearn.model_selection import KFold
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from . import RF_Hyperparameters
+
+ # cuML and CuPy are optional; record their availability instead of crashing at import time.
+ try:
+     import cuml
+     import cupy as cp
+     from cuml.ensemble import RandomForestRegressor as cuRandomForestRegressor
+     GPU_AVAILABLE = True
+ except ImportError:
+     GPU_AVAILABLE = False
+
+ handle = None  # NVML device handle, initialized in RF_reg()
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Argument parser")
+     parser.add_argument('--methods', type=str, default='RandomForest/', help='Method name')
+     parser.add_argument('--species', type=str, default='Cattle/', help='Species name')
+     parser.add_argument('--phe', type=str, default='', help='Dataset name')
+     parser.add_argument('--data_dir', type=str, default='../../data/')
+     parser.add_argument('--result_dir', type=str, default='result/')
+
+     parser.add_argument('--n_estimators', type=int, default=100)
+     parser.add_argument('--max_depth', type=int, default=1)
+     parser.add_argument('--n_jobs', type=int, default=-1, help='Number of CPU cores to use (-1 for all cores)')
+     # note: argparse's type=bool treats any non-empty string as True
+     parser.add_argument('--use_gpu', type=bool, default=True, help='Whether to use GPU acceleration')
+     args = parser.parse_args()
+     return args
+
+
+ def load_data(args):
+     xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+     yData = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_0"]
+     names = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))["arr_1"]
+
+     nsample = xData.shape[0]
+     nsnp = xData.shape[1]
+     print("Number of samples: ", nsample)
+     print("Number of SNPs: ", nsnp)
+     return xData, yData, nsample, nsnp, names
+
+
+ def get_gpu_mem_by_pid(pid):
+     if handle is None:
+         return 0.0
+     try:
+         procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+         for p in procs:
+             if p.pid == pid:
+                 return p.usedGpuMemory / 1024**2
+         return 0.0
+     except Exception:
+         return 0.0
+
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+
+ def run_nested_cv(args, data, label):
+     result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+     os.makedirs(result_dir, exist_ok=True)
+     print("Starting 10-fold cross-validation with Random Forest...")
+
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+     time_start = time.time()
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         x_train, x_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         x_train = x_train.astype(np.float32)
+         x_test = x_test.astype(np.float32)
+         y_train_scaled = y_train.astype(np.float32)
+         y_test_scaled = y_test.astype(np.float32)
+
+         # Move the fold data onto the GPU for cuML.
+         x_train_gpu = cp.asarray(x_train)
+         x_test_gpu = cp.asarray(x_test)
+         y_train_gpu = cp.asarray(y_train_scaled)
+
+         model = cuRandomForestRegressor(
+             n_estimators=args.n_estimators,
+             max_depth=args.max_depth,
+             random_state=42,
+             n_streams=1  # a single stream keeps results reproducible
+         )
+
+         model.fit(x_train_gpu, y_train_gpu)
+
+         y_test_preds = cp.asnumpy(model.predict(x_test_gpu))
+
+         y_test_original = y_test_scaled.reshape(-1)  # already a host array; no GPU round-trip needed
+         y_pred = y_test_preds.reshape(-1)
+
+         mse = mean_squared_error(y_test_original, y_pred)
+         r2 = r2_score(y_test_original, y_pred)
+         mae = mean_absolute_error(y_test_original, y_pred)
+         pcc, _ = pearsonr(y_test_original, y_pred)
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = get_gpu_mem_by_pid(os.getpid())
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+
+         acceleration_status = "GPU" if GPU_AVAILABLE else f"CPU({args.n_jobs} cores)"
+         print(f'Fold {fold}[{acceleration_status}]: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+         results_df = pd.DataFrame({'Y_test': y_test_original, 'Y_pred': y_pred})
+         results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+     print("\n===== Cross-validation summary =====")
+     print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+     print(f"Time: {time.time() - time_start:.2f}s")
+
+
+ def RF_reg():
+     global handle
+     set_seed(42)
+     if not GPU_AVAILABLE:
+         # run_nested_cv() depends on CuPy/cuML; fail early with a clear message.
+         raise RuntimeError("cuML and CuPy are required to run the GPU Random Forest benchmark")
+     pynvml.nvmlInit()
+     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+     all_species = ['Cotton/']
+
+     args = parse_args()
+     for species in all_species:
+         args.species = species
+         X, Y, nsamples, nsnp, names = load_data(args)
+
+         for j in range(len(names)):
+             args.phe = names[j]
+             print(f"Starting run: {args.methods}{args.species}{args.phe}")
+
+             label = Y[:, j]
+             label = np.nan_to_num(label, nan=np.nanmean(label))
+             best_params = RF_Hyperparameters.Hyperparameter(X, label)
+             args.n_estimators = best_params['n_estimators']
+             args.max_depth = best_params['max_depth']
+             start_time = time.time()
+             if torch.cuda.is_available():
+                 torch.cuda.reset_peak_memory_stats()
+             run_nested_cv(args, data=X, label=label)
+
+             elapsed_time = time.time() - start_time
+             print(f"running time: {elapsed_time:.2f} s")
+             print("finished successfully")
+
+
+ if __name__ == "__main__":
+     RF_reg()
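
Note: run_nested_cv() calls cuML unconditionally, so the GPU_AVAILABLE flag only affects the log label; the explicit check in RF_reg() turns a missing cuML install into a clear error rather than a NameError mid-fold. The package ships no CPU path for this runner, but a hypothetical helper that could live alongside run_nested_cv(), mirroring the cuML parameters with scikit-learn, might look like:

    from sklearn.ensemble import RandomForestRegressor

    def make_rf(args):
        # Hypothetical helper, not part of gpbench: prefer cuML when present,
        # else fall back to scikit-learn on the CPU.
        if GPU_AVAILABLE:
            return cuRandomForestRegressor(n_estimators=args.n_estimators,
                                           max_depth=args.max_depth,
                                           random_state=42, n_streams=1)
        return RandomForestRegressor(n_estimators=args.n_estimators,
                                     max_depth=args.max_depth,
                                     random_state=42, n_jobs=args.n_jobs)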