gpbench-1.0.0-py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
gpbench/method_reg/DNNGP/DNNGP.py
@@ -0,0 +1,157 @@
+ import os
+ import time
+ import psutil
+ import argparse
+ import random
+ import torch
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import KFold, train_test_split
+ from .base_dnngp import DNNGP
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from . import DNNGP_Hyperparameters
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Argument parser")
+     parser.add_argument('--methods', type=str, default='DNNGP/', help='Method name')
+     parser.add_argument('--species', type=str, default='')
+     parser.add_argument('--phe', type=str, default='', help='Phenotype (trait) name')
+     parser.add_argument('--data_dir', type=str, default='../../data/')
+     parser.add_argument('--result_dir', type=str, default='result/')
+
+     parser.add_argument('--epoch', type=int, default=1000, help='Number of training epochs')
+     parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
+     parser.add_argument('--lr', type=float, default=0.01, help='Learning rate')
+     parser.add_argument('--weight_decay', type=float, default=0.001, help='Weight decay')
+     parser.add_argument('--patience', type=int, default=10, help='Patience for early stopping')
+     parser.add_argument('--dropout1', type=float, default=0.5, help='Dropout rate for layer 1')
+     parser.add_argument('--dropout2', type=float, default=0.5, help='Dropout rate for layer 2')
+     args = parser.parse_args()
+     return args
+
+ def load_data(args):
+     # genotype.npz holds "arr_0" = (n_samples, n_snps);
+     # phenotype.npz holds "arr_0" = (n_samples, n_traits) and "arr_1" = trait names
+     xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+     phenotypes = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))
+     yData = phenotypes["arr_0"]
+     names = phenotypes["arr_1"]
+
+     nsample = xData.shape[0]
+     nsnp = xData.shape[1]
+     print("Number of samples: ", nsample)
+     print("Number of SNPs: ", nsnp)
+     return xData, yData, nsample, nsnp, names
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def run_nested_cv(args, data, label, nsnp, device):
+     result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+     os.makedirs(result_dir, exist_ok=True)
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+     time_start = time.time()
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         # hold out 10% of the training fold for early stopping
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
+         y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
+         x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
+         y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
+         x_test_tensor = torch.from_numpy(X_test).float().to(device)
+         y_test_tensor = torch.from_numpy(y_test).float().to(device)
+
+         # add a channel dimension: (batch, 1, n_snps) for Conv1d
+         x_train_tensor = x_train_tensor.unsqueeze(1)
+         x_valid_tensor = x_valid_tensor.unsqueeze(1)
+         x_test_tensor = x_test_tensor.unsqueeze(1)
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, args.batch_size, shuffle=True)
+         valid_loader = DataLoader(valid_data, args.batch_size, shuffle=False)
+         test_loader = DataLoader(test_data, args.batch_size, shuffle=False)
+
+         model = DNNGP(nsnp, args.dropout1, args.dropout2)
+         model.train_model(train_loader, valid_loader, args.epoch, args.lr, args.weight_decay, args.patience, device)
+         y_pred = model.predict(test_loader)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             torch.cuda.reset_peak_memory_stats()
+         results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
+         results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+     print("\n===== Cross-validation summary =====")
+     print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+     print(f"Time: {time.time() - time_start:.2f}s")
+
+
+ def DNNGP_reg():
+     set_seed(42)
+     torch.cuda.empty_cache()
+     args = parse_args()
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     all_species = ['Cotton/']
+
+     for i in range(len(all_species)):
+         args.species = all_species[i]
+         args.device = device
+         X, Y, nsamples, nsnp, names = load_data(args)
+         for j in range(len(names)):
+             args.phe = names[j]
+             print("starting run " + args.methods + args.species + args.phe)
+             label = Y[:, j]
+             # impute missing phenotypes with the trait mean
+             label = np.nan_to_num(label, nan=np.nanmean(label))
+             best_params = DNNGP_Hyperparameters.Hyperparameter(X, label, nsnp)
+             args.lr = best_params['learning_rate']
+             args.patience = best_params['patience']
+             args.dropout1 = best_params['dropout1']
+             args.dropout2 = best_params['dropout2']
+             args.weight_decay = best_params['weight_decay']
+             start_time = time.time()
+             if torch.cuda.is_available():
+                 torch.cuda.reset_peak_memory_stats()
+             process = psutil.Process(os.getpid())
+
+             run_nested_cv(args, data=X, label=label, nsnp=nsnp, device=args.device)
+
+             elapsed_time = time.time() - start_time
+             print(f"running time: {elapsed_time:.2f} s")
+             print("finished successfully")
+
+
+ if __name__ == "__main__":
+     DNNGP_reg()
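
load_data() above expects a per-species directory holding genotype.npz (key "arr_0": an n_samples x n_snps matrix) and phenotype.npz (key "arr_0": an n_samples x n_traits matrix; key "arr_1": the trait names). The following is a minimal sketch of that layout with synthetic data; the shapes, directory name, and trait names are illustrative assumptions, not part of the package.

import os
import numpy as np

rng = np.random.default_rng(0)
species_dir = os.path.join("data", "Cotton")  # load_data() joins data_dir + species
os.makedirs(species_dir, exist_ok=True)

n_samples, n_snps, n_traits = 200, 1000, 2
genotype = rng.integers(0, 3, size=(n_samples, n_snps)).astype(np.float32)  # 0/1/2 dosages (assumed coding)
phenotype = rng.normal(size=(n_samples, n_traits)).astype(np.float32)       # synthetic trait values
trait_names = np.array(["TraitA", "TraitB"])                                # hypothetical trait names

# keys match what load_data() reads: "arr_0" and, for phenotypes, "arr_1"
np.savez(os.path.join(species_dir, "genotype.npz"), arr_0=genotype)
np.savez(os.path.join(species_dir, "phenotype.npz"), arr_0=phenotype, arr_1=trait_names)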
gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py
@@ -0,0 +1,118 @@
+ import os
+ import time
+ import psutil
+ import random
+ import torch
+ import numpy as np
+ import optuna
+ from sklearn.model_selection import KFold, train_test_split
+ from .base_dnngp import DNNGP
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from optuna.exceptions import TrialPruned
+
+ def run_nested_cv_with_early_stopping(data, label, nsnp, learning_rate,
+                                       dropout1, dropout2, weight_decay, patience, batch_size=64, epoch=1000):
+     # use the GPU when available, otherwise the CPU
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
+         y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
+         x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
+         y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
+         x_test_tensor = torch.from_numpy(X_test).float().to(device)
+         y_test_tensor = torch.from_numpy(y_test).float().to(device)
+
+         x_train_tensor = x_train_tensor.unsqueeze(1)
+         x_valid_tensor = x_valid_tensor.unsqueeze(1)
+         x_test_tensor = x_test_tensor.unsqueeze(1)
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, batch_size, shuffle=True)
+         valid_loader = DataLoader(valid_data, batch_size, shuffle=False)
+         test_loader = DataLoader(test_data, batch_size, shuffle=False)
+
+         model = DNNGP(nsnp, dropout1, dropout2)
+         model.train_model(train_loader, valid_loader, epoch, learning_rate, weight_decay, patience, device)
+         y_pred = model.predict(test_loader)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         if np.isnan(pcc):
+             print(f"Fold {fold} resulted in NaN PCC, pruning the trial...")
+             raise TrialPruned()
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+     return np.mean(all_pcc) if all_pcc else 0.0
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def Hyperparameter(data, label, nsnp):
+     set_seed(42)
+
+     def objective(trial):
+         lr = trial.suggest_float("learning_rate", 1e-4, 0.1)
+         patience = trial.suggest_int("patience", 10, 100, step=10)
+         batch_size = trial.suggest_categorical("batch_size", [64])
+         dropout1 = trial.suggest_float("dropout1", 0.0, 0.9, step=0.1)
+         dropout2 = trial.suggest_float("dropout2", 0.0, 0.9, step=0.1)
+         weight_decay = trial.suggest_categorical("weight_decay", [1e-5, 1e-4, 1e-3])
+
+         try:
+             corr_score = run_nested_cv_with_early_stopping(
+                 data=data,
+                 label=label,
+                 nsnp=nsnp,
+                 learning_rate=lr,
+                 patience=patience,
+                 dropout1=dropout1,
+                 dropout2=dropout2,
+                 weight_decay=weight_decay,
+                 batch_size=batch_size
+             )
+         except TrialPruned:
+             # a pruned trial scores worst so "maximize" never selects it
+             return float("-inf")
+         return corr_score
+
+     study = optuna.create_study(direction="maximize")
+     study.optimize(objective, n_trials=20)
+
+     print("best params:", study.best_params)
+     print("finished successfully")
+     return study.best_params
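
Hyperparameter() can also be called directly. A hedged usage sketch, assuming gpbench is installed and reusing synthetic arrays like those above; note that each of the 20 Optuna trials runs a full 10-fold CV, so even toy inputs take a while.

import numpy as np
from gpbench.method_reg.DNNGP import DNNGP_Hyperparameters

rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(200, 1000)).astype(np.float32)  # synthetic genotypes
label = rng.normal(size=200).astype(np.float32)              # synthetic trait values

# returns study.best_params; keys match the suggest_* names above
best = DNNGP_Hyperparameters.Hyperparameter(X, label, nsnp=X.shape[1])
print(best["learning_rate"], best["patience"], best["dropout1"])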
gpbench/method_reg/DNNGP/__init__.py
@@ -0,0 +1,5 @@
+ from .DNNGP import DNNGP_reg
+
+ DNNGP = DNNGP_reg
+
+ __all__ = ["DNNGP", "DNNGP_reg"]
gpbench/method_reg/DNNGP/base_dnngp.py
@@ -0,0 +1,101 @@
+ import copy
+
+ import torch
+ import torch.nn as nn
+ import numpy as np
+
+
+ class DNNGP(nn.Module):
+     def __init__(self, input_size, dropout1, dropout2):
+         super().__init__()
+         self.CNN1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=4)
+         self.Relu1 = nn.ReLU()
+         self.Drop1 = nn.Dropout(dropout1)
+
+         self.Batchnorm = nn.BatchNorm1d(num_features=64)
+
+         self.CNN2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=4)
+         self.Relu2 = nn.ReLU()
+         self.Drop2 = nn.Dropout(dropout2)
+
+         self.CNN3 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=4)
+         self.Relu3 = nn.ReLU()
+
+         self.Flatten = nn.Flatten()
+         # each kernel-4 convolution shortens the sequence by 3, 9 in total
+         self.Dense = nn.Linear(in_features=64 * (input_size - 9), out_features=3)
+         self.Output = nn.Linear(in_features=3, out_features=1)
+
+     def forward(self, x):
+         x = self.CNN1(x)
+         x = self.Relu1(x)
+         x = self.Drop1(x)
+         x = self.Batchnorm(x)
+         x = self.CNN2(x)
+         x = self.Relu2(x)
+         x = self.Drop2(x)
+         x = self.CNN3(x)
+         x = self.Relu3(x)
+         x = self.Flatten(x)
+         x = self.Dense(x)
+         x = self.Output(x)
+         return x
+
+     def train_model(self, train_loader, valid_loader, num_epochs, learning_rate, weight_decay, patience, device):
+         optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate, weight_decay=weight_decay)
+         criterion = nn.MSELoss()
+         self.to(device)
+
+         best_loss = float('inf')
+         best_state = None
+         trigger_times = 0
+
+         for epoch in range(num_epochs):
+             self.train()
+             train_loss = 0.0
+             for inputs, labels in train_loader:
+                 inputs, labels = inputs.to(device), labels.to(device)
+                 optimizer.zero_grad()
+                 outputs = self(inputs)
+                 labels = labels.unsqueeze(1)
+                 loss = criterion(outputs, labels)
+                 loss.backward()
+                 optimizer.step()
+                 train_loss += loss.item() * inputs.size(0)
+
+             self.eval()
+             valid_loss = 0.0
+             with torch.no_grad():
+                 for inputs, labels in valid_loader:
+                     inputs, labels = inputs.to(device), labels.to(device)
+                     outputs = self(inputs)
+                     labels = labels.unsqueeze(1)
+                     loss = criterion(outputs, labels)
+                     valid_loss += loss.item() * inputs.size(0)
+
+             train_loss /= len(train_loader.dataset)
+             valid_loss /= len(valid_loader.dataset)
+
+             # ---------- Early stopping ----------
+             if valid_loss < best_loss:
+                 best_loss = valid_loss
+                 # deep-copy the snapshot; state_dict() returns live references
+                 best_state = copy.deepcopy(self.state_dict())
+                 trigger_times = 0
+             else:
+                 trigger_times += 1
+                 if trigger_times >= patience:
+                     print(f"Early stopping at epoch {epoch+1}")
+                     break
+
+         if best_state is not None:
+             self.load_state_dict(best_state)
+         return best_loss
+
+     def predict(self, test_loader):
+         # assumes the loader's tensors are already on the model's device
+         self.eval()
+         y_pred = []
+         with torch.no_grad():
+             for inputs, _ in test_loader:
+                 outputs = self(inputs)
+                 y_pred.append(outputs.cpu().numpy())
+         y_pred = np.concatenate(y_pred, axis=0)
+         y_pred = np.squeeze(y_pred)
+         return y_pred
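
A quick shape check on the network (illustrative, assuming gpbench is installed): each of the three kernel-4 Conv1d layers shortens the sequence by 3, so the flattened width is 64 * (input_size - 9), which is exactly what the Dense layer above expects.

import torch
from gpbench.method_reg.DNNGP.base_dnngp import DNNGP

nsnp = 100
model = DNNGP(input_size=nsnp, dropout1=0.5, dropout2=0.5)
x = torch.randn(8, 1, nsnp)        # (batch, channel, SNPs), matching the unsqueeze(1) in DNNGP.py
assert model(x).shape == (8, 1)    # one regression output per sample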
gpbench/method_reg/DeepCCR/DeepCCR.py
@@ -0,0 +1,149 @@
+ import os
+ import time
+ import psutil
+ import argparse
+ import random
+ import torch
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import KFold, train_test_split
+ from .base_DeepCCR import DeepCCR
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from . import DeepCCR_Hyperparameters
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Argument parser")
+     parser.add_argument('--methods', type=str, default='DeepCCR/', help='Method name')
+     parser.add_argument('--species', type=str, default='')
+     parser.add_argument('--phe', type=str, default='', help='Phenotype (trait) name')
+     parser.add_argument('--data_dir', type=str, default='../../data/')
+     parser.add_argument('--result_dir', type=str, default='result/')
+
+     parser.add_argument('--epoch', type=int, default=1000, help='Number of training epochs')
+     parser.add_argument('--batch_size', type=int, default=64, help='Batch size')
+     parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
+     parser.add_argument('--patience', type=int, default=10, help='Patience for early stopping')
+     args = parser.parse_args()
+     return args
+
+ def load_data(args):
+     # genotype.npz holds "arr_0" = (n_samples, n_snps);
+     # phenotype.npz holds "arr_0" = (n_samples, n_traits) and "arr_1" = trait names
+     xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
+     phenotypes = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))
+     yData = phenotypes["arr_0"]
+     names = phenotypes["arr_1"]
+
+     nsample = xData.shape[0]
+     nsnp = xData.shape[1]
+     print("Number of samples: ", nsample)
+     print("Number of SNPs: ", nsnp)
+     return xData, yData, nsample, nsnp, names
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def run_nested_cv(args, data, label, nsnp, device):
+     result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
+     os.makedirs(result_dir, exist_ok=True)
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+     time_start = time.time()
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         # hold out 10% of the training fold for early stopping
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+         # tensors are created on the CPU; the device is passed to train_model/predict below
+         x_train_tensor = torch.from_numpy(X_train_sub).float()
+         y_train_tensor = torch.from_numpy(y_train_sub).float()
+         x_valid_tensor = torch.from_numpy(X_valid).float()
+         y_valid_tensor = torch.from_numpy(y_valid).float()
+         x_test_tensor = torch.from_numpy(X_test).float()
+         y_test_tensor = torch.from_numpy(y_test).float()
+         # add a channel dimension: (batch, 1, n_snps)
+         x_train_tensor = x_train_tensor.unsqueeze(1)
+         x_valid_tensor = x_valid_tensor.unsqueeze(1)
+         x_test_tensor = x_test_tensor.unsqueeze(1)
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, args.batch_size, shuffle=True)
+         valid_loader = DataLoader(valid_data, args.batch_size, shuffle=False)
+         test_loader = DataLoader(test_data, args.batch_size, shuffle=False)
+
+         model = DeepCCR(input_seq_len=nsnp)
+         model.train_model(train_loader, valid_loader, args.epoch, args.lr, args.patience, device)
+         y_pred = model.predict(test_loader, device)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             torch.cuda.reset_peak_memory_stats()
+         results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
+         results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)
+
+     print("\n===== Cross-validation summary =====")
+     print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
+     print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
+     print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
+     print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
+     print(f"Time: {time.time() - time_start:.2f}s")
+
+
+ def DeepCCR_reg():
+     set_seed(42)
+     torch.cuda.empty_cache()
+     args = parse_args()
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     all_species = ['Cotton/']
+     for i in range(len(all_species)):
+         args.species = all_species[i]
+         args.device = device
+         X, Y, nsamples, nsnp, names = load_data(args)
+         for j in range(len(names)):
+             args.phe = names[j]
+             print("starting run " + args.methods + args.species + args.phe)
+             label = Y[:, j]
+             # impute missing phenotypes with the trait mean
+             label = np.nan_to_num(label, nan=np.nanmean(label))
+             best_params = DeepCCR_Hyperparameters.Hyperparameter(X, label, nsnp)
+             args.lr = best_params['learning_rate']
+             args.patience = best_params['patience']
+             args.batch_size = best_params['batch_size']
+             start_time = time.time()
+             if torch.cuda.is_available():
+                 torch.cuda.reset_peak_memory_stats()
+             process = psutil.Process(os.getpid())
+             run_nested_cv(args, data=X, label=label, nsnp=nsnp, device=args.device)
+
+             elapsed_time = time.time() - start_time
+             print(f"running time: {elapsed_time:.2f} s")
+             print("finished successfully")
+
+
+ if __name__ == "__main__":
+     DeepCCR_reg()
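
DeepCCR_reg(), like DNNGP_reg() above, reads its settings from the command line, so a programmatic run can populate sys.argv first. A hedged sketch with illustrative paths, assuming gpbench is installed and the data layout sketched earlier:

import sys
from gpbench.method_reg.DeepCCR.DeepCCR import DeepCCR_reg

# argv[0] is just the program name; the flags mirror parse_args() above
sys.argv = ["DeepCCR", "--data_dir", "data/", "--result_dir", "result/"]
DeepCCR_reg()  # iterates the hard-coded species list ('Cotton/') and every trait in phenotype.npz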
gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py
@@ -0,0 +1,110 @@
+ import os
+ import time
+ import psutil
+ import random
+ import torch
+ import numpy as np
+ import optuna
+ from sklearn.model_selection import KFold, train_test_split
+ from .base_DeepCCR import DeepCCR
+ from scipy.stats import pearsonr
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+ from torch.utils.data import DataLoader, TensorDataset
+ from optuna.exceptions import TrialPruned
+
+ def run_nested_cv_with_early_stopping(data, label, nsnp, learning_rate, batch_size, patience, epoch=1000):
+     # use the GPU when available, otherwise the CPU
+     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+     print("Starting 10-fold cross-validation...")
+     kf = KFold(n_splits=10, shuffle=True, random_state=42)
+     all_mse, all_mae, all_r2, all_pcc = [], [], [], []
+
+     for fold, (train_index, test_index) in enumerate(kf.split(data)):
+         print(f"Running fold {fold}...")
+         process = psutil.Process(os.getpid())
+         fold_start_time = time.time()
+
+         X_train, X_test = data[train_index], data[test_index]
+         y_train, y_test = label[train_index], label[test_index]
+
+         X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
+
+         x_train_tensor = torch.from_numpy(X_train_sub).float()
+         y_train_tensor = torch.from_numpy(y_train_sub).float()
+         x_valid_tensor = torch.from_numpy(X_valid).float()
+         y_valid_tensor = torch.from_numpy(y_valid).float()
+         x_test_tensor = torch.from_numpy(X_test).float()
+         y_test_tensor = torch.from_numpy(y_test).float()
+         x_train_tensor = x_train_tensor.unsqueeze(1)
+         x_valid_tensor = x_valid_tensor.unsqueeze(1)
+         x_test_tensor = x_test_tensor.unsqueeze(1)
+
+         train_data = TensorDataset(x_train_tensor, y_train_tensor)
+         valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
+         test_data = TensorDataset(x_test_tensor, y_test_tensor)
+
+         train_loader = DataLoader(train_data, batch_size, shuffle=True)
+         valid_loader = DataLoader(valid_data, batch_size, shuffle=False)
+         test_loader = DataLoader(test_data, batch_size, shuffle=False)
+
+         model = DeepCCR(input_seq_len=nsnp)
+         model.train_model(train_loader, valid_loader, epoch, learning_rate, patience, device)
+         y_pred = model.predict(test_loader, device)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         mae = mean_absolute_error(y_test, y_pred)
+         pcc, _ = pearsonr(y_test, y_pred)
+
+         if np.isnan(pcc):
+             print(f"Fold {fold} resulted in NaN PCC, pruning the trial...")
+             raise TrialPruned()
+
+         all_mse.append(mse)
+         all_r2.append(r2)
+         all_mae.append(mae)
+         all_pcc.append(pcc)
+
+         fold_time = time.time() - fold_start_time
+         fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
+         fold_cpu_mem = process.memory_info().rss / 1024**2
+         print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
+               f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
+
+     return np.mean(all_pcc) if all_pcc else 0.0
+
+ def set_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def Hyperparameter(data, label, nsnp):
+     set_seed(42)
+
+     def objective(trial):
+         lr = trial.suggest_float("learning_rate", 1e-4, 0.1)
+         batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
+         patience = trial.suggest_int("patience", 1, 10)
+         try:
+             corr_score = run_nested_cv_with_early_stopping(
+                 data=data,
+                 label=label,
+                 nsnp=nsnp,
+                 learning_rate=lr,
+                 batch_size=batch_size,
+                 patience=patience
+             )
+         except TrialPruned:
+             # a pruned trial scores worst so "maximize" never selects it
+             return float("-inf")
+         return corr_score
+
+     study = optuna.create_study(direction="maximize")
+     study.optimize(objective, n_trials=20)
+
+     print("best params:", study.best_params)
+     print("finished successfully")
+     return study.best_params
gpbench/method_reg/DeepCCR/__init__.py
@@ -0,0 +1,5 @@
+ from .DeepCCR import DeepCCR_reg
+
+ DeepCCR = DeepCCR_reg
+
+ __all__ = ["DeepCCR", "DeepCCR_reg"]