gpbench 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
@@ -0,0 +1,55 @@
1
+
2
+ #==============================================================
+ # Usage example:
+ #   import method_class
+ #   print(method_class.METHODS)
+ #   m = method_class.load_method("BayesA")
+ #   print(m)  # method_class.BayesA
+ #
+ # or:
+ #
+ #   from method_class.BayesA import BayesA_class
+ #   BayesA_class()
+ #==============================================================
14
+ from __future__ import annotations
15
+ import importlib
16
+ from typing import List
17
+
18
# Registry of every method sub-package shipped with this package.
METHODS: List[str] = [
    "BayesA",
    "BayesB",
    "BayesC",
    "LightGBM",
    "CropARNet",
    "Cropformer",
    "DeepCCR",
    "DeepGS",
    "DNNGP",
    "EIR",
    "G2PDeep",
    "GBLUP",
    "GEFormer",
    "RF",
    "rrBLUP",
    "SoyDNGP",
    "SVC",
    "XGBoost",
    "ElasticNet",
    "DL_GWAS",
    "MVP",
]

__all__ = ["METHODS", "load_method"]


def load_method(name: str):
    """Dynamically import the sub-package for *name* and return its module.

    Usage:
        m = method_class.load_method("BayesA")
        # m is the method_class.BayesA package

    Raises:
        ValueError: if *name* is not listed in METHODS.
    """
    if name not in METHODS:
        raise ValueError(f"Unknown method '{name}'. Available: {METHODS}")
    return importlib.import_module(f"{__name__}.{name}")
54
+
55
+
@@ -0,0 +1,5 @@
1
# Re-export the regression entry point and expose it under the
# shorter package-level alias ``rrBLUP`` for convenience.
from .rrBLUP import rrBLUP_reg

rrBLUP = rrBLUP_reg

__all__ = ["rrBLUP","rrBLUP_reg"]
@@ -0,0 +1,123 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import argparse
5
+ import random
6
+ import torch
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.model_selection import KFold
10
+ from scipy.stats import pearsonr
11
+ from rpy2.robjects.packages import importr
12
+ from rpy2.robjects import pandas2ri
13
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
14
+ pandas2ri.activate()
15
+
16
def parse_args(argv=None):
    """Parse command-line arguments for the rrBLUP runner.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``,
            which makes argparse read ``sys.argv[1:]`` — identical to the
            original no-argument behavior.

    Returns:
        argparse.Namespace with ``methods``, ``species``, ``phe``,
        ``data_dir`` and ``result_dir`` attributes.
    """
    parser = argparse.ArgumentParser(description="Argument parser")
    # NOTE: directory-style values carry a trailing slash because they are
    # concatenated directly when building the result path in run_nested_cv.
    parser.add_argument('--methods', type=str, default='rrBLUP/', help='Method name (with trailing slash)')
    parser.add_argument('--species', type=str, default='', help='Species name (with trailing slash)')
    parser.add_argument('--phe', type=str, default='', help='Phenotype (dataset) name')
    parser.add_argument('--data_dir', type=str, default='../../data/', help='Directory containing species data folders')
    parser.add_argument('--result_dir', type=str, default='result/', help='Directory for result output')
    args = parser.parse_args(argv)
    return args
25
+
26
def load_data(args):
    """Load genotype and phenotype arrays for ``args.species``.

    Expects ``genotype.npz`` (arr_0: samples x SNPs) and ``phenotype.npz``
    (arr_0: samples x phenotypes matrix, arr_1: phenotype names) under
    ``args.data_dir/args.species``.

    Returns:
        Tuple (xData, yData, nsample, nsnp, names).
    """
    xData = np.load(os.path.join(args.data_dir, args.species, 'genotype.npz'))["arr_0"]
    # Open the phenotype archive once instead of twice (matrix + names).
    phenotype = np.load(os.path.join(args.data_dir, args.species, 'phenotype.npz'))
    yData = phenotype["arr_0"]
    names = phenotype["arr_1"]

    nsample = xData.shape[0]
    nsnp = xData.shape[1]
    print("Number of samples: ", nsample)
    print("Number of SNPs: ", nsnp)
    return xData, yData, nsample, nsnp, names
36
+
37
def set_seed(seed=42):
    """Seed all RNG sources (Python, NumPy, PyTorch CPU/GPU) for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels and disable autotuning so repeated
    # runs with the same seed produce identical results.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
44
+
45
def run_nested_cv(args, data, label):
    """Run 10-fold cross-validation with R's rrBLUP mixed.solve via rpy2.

    Args:
        args: Namespace with result_dir/methods/species/phe used to build
            the output directory (names are concatenated, so the trailing
            slashes in the CLI defaults matter).
        data: Genotype matrix (samples x SNPs).
        label: 1-D phenotype vector aligned with ``data`` rows.

    Side effects: writes ``fold{n}.csv`` per fold under the result directory
    and prints per-fold and summary metrics (PCC, MAE, MSE, R2).
    """
    result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(result_dir, exist_ok=True)
    # Bridge to the R 'rrBLUP' package through rpy2.
    rrblup = importr('rrBLUP')
    print("Successfully loaded rrBLUP")

    print("Starting 10-fold cross-validation...")
    # Fixed random_state keeps fold assignments reproducible across runs.
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    all_mse, all_mae, all_r2, all_pcc = [], [], [], []
    time_star = time.time()
    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        # Process handle used below to report resident memory per fold.
        process = psutil.Process(os.getpid())
        fold_start_time = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # mixed.solve returns fixed effect(s) 'beta' and marker effects 'u'.
        model = rrblup.mixed_solve(y=y_train, Z=X_train)

        beta = np.array(model.rx2('beta'))
        u = np.array(model.rx2('u'))

        #y_new = Z_new * u + beta
        Z_new = np.array(X_test)
        y_pred = np.dot(Z_new, u) + beta

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        pcc, _ = pearsonr(y_test, y_pred)

        all_mse.append(mse)
        all_r2.append(r2)
        all_mae.append(mae)
        all_pcc.append(pcc)

        fold_time = time.time() - fold_start_time
        # GPU memory is reported as 0 on CPU-only machines.
        fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
        fold_cpu_mem = process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
              f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')
        # Persist per-fold predictions for downstream analysis.
        results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
        results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)

    print("\n===== Cross-validation summary =====")
    print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
    print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
    print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
    print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
    print(f"Time: {time.time() - time_star:.2f}s")
97
+
98
+
99
def rrBLUP_reg():
    """Entry point: run rrBLUP 10-fold CV for every configured species/phenotype.

    Loads genotype/phenotype data for each species in the hard-coded list and
    runs cross-validation once per phenotype column, timing each run.
    """
    set_seed(42)
    # Guard CUDA-only calls so the runner also works on CPU-only machines
    # (the original unconditionally touched torch.cuda and failed without a GPU).
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    args = parse_args()
    all_species = ['Cotton/']
    for species in all_species:
        args.species = species
        X, Y, nsamples, nsnp, names = load_data(args)
        for j in range(len(names)):
            args.phe = names[j]
            print("starting run " + args.methods + args.species + args.phe)
            label = Y[:, j]
            # Impute missing phenotype values with the column mean.
            label = np.nan_to_num(label, nan=np.nanmean(label))
            start_time = time.time()
            if torch.cuda.is_available():
                torch.cuda.reset_peak_memory_stats()
            run_nested_cv(args, data=X, label=label)

            elapsed_time = time.time() - start_time
            print(f"running time: {elapsed_time:.2f} s")
            print("successfully")

if __name__ == "__main__":
    rrBLUP_reg()
@@ -0,0 +1,379 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpbench
3
+ Version: 1.0.0
4
+ Summary: A benchmarking toolkit for genomic prediction with multiple methods and LLM-powered analysis
5
+ Author: GPBench Contributors
6
+ License: MIT
7
+ Keywords: genomic prediction,bioinformatics,machine learning,deep learning
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ Requires-Dist: numpy==1.26.4
19
+ Requires-Dist: pandas<2.2.3,>=2.0
20
+ Requires-Dist: scipy==1.13.1
21
+ Requires-Dist: matplotlib==3.9.4
22
+ Requires-Dist: seaborn==0.13.2
23
+ Requires-Dist: scikit-learn==1.6.1
24
+ Requires-Dist: torch==2.8.0
25
+ Requires-Dist: torchvision==0.23.0
26
+ Requires-Dist: torchmetrics==1.8.2
27
+ Requires-Dist: pytorch-lightning==2.5.6
28
+ Requires-Dist: lightning==2.5.6
29
+ Requires-Dist: lightning-utilities==0.15.2
30
+ Requires-Dist: tensorflow==2.20.0
31
+ Requires-Dist: keras==3.10.0
32
+ Requires-Dist: tensorboard==2.20.0
33
+ Requires-Dist: tensorboard-data-server==0.7.2
34
+ Requires-Dist: xgboost==2.1.4
35
+ Requires-Dist: lightgbm==4.6.0
36
+ Requires-Dist: optuna==2.10.0
37
+ Requires-Dist: umap-learn==0.5.9.post2
38
+ Requires-Dist: pynndescent==0.5.13
39
+ Requires-Dist: langchain-core==0.3.82
40
+ Requires-Dist: langchain-openai==0.3.35
41
+ Requires-Dist: langgraph==0.6.11
42
+ Requires-Dist: langgraph-checkpoint==2.1.2
43
+ Requires-Dist: langgraph-prebuilt==0.6.5
44
+ Requires-Dist: langgraph-sdk==0.2.9
45
+ Requires-Dist: langsmith==0.4.37
46
+ Requires-Dist: openai==2.8.1
47
+ Requires-Dist: dashscope==1.25.7
48
+ Requires-Dist: tiktoken==0.12.0
49
+ Requires-Dist: transformers==4.57.3
50
+ Requires-Dist: tokenizers==0.22.1
51
+ Requires-Dist: huggingface-hub==0.36.0
52
+ Requires-Dist: psutil==7.1.3
53
+ Requires-Dist: tqdm==4.67.1
54
+ Requires-Dist: pyyaml==6.0.3
55
+ Requires-Dist: requests==2.32.5
56
+ Requires-Dist: requests-toolbelt==1.0.0
57
+ Requires-Dist: python-dotenv==1.2.1
58
+ Requires-Dist: rich==13.9.4
59
+ Requires-Dist: rich-argparse==1.7.2
60
+ Requires-Dist: pyecharts==2.0.9
61
+ Requires-Dist: h5py==3.14.0
62
+ Requires-Dist: pandas-plink==2.2.9
63
+ Requires-Dist: xarray==2024.7.0
64
+ Requires-Dist: statsmodels==0.14.5
65
+ Requires-Dist: patsy==1.0.2
66
+ Requires-Dist: rpy2==3.5.16
67
+ Requires-Dist: aiohttp==3.13.2
68
+ Requires-Dist: httpx==0.28.1
69
+ Requires-Dist: httpcore==1.0.9
70
+ Requires-Dist: anyio==4.12.0
71
+ Requires-Dist: orjson==3.11.5
72
+ Requires-Dist: ormsgpack==1.11.0
73
+ Requires-Dist: simplejson==3.20.2
74
+ Requires-Dist: protobuf==6.33.0
75
+ Requires-Dist: flatbuffers==25.9.23
76
+ Requires-Dist: sympy==1.14.0
77
+ Requires-Dist: mpmath==1.3.0
78
+ Requires-Dist: opt-einsum==3.4.0
79
+ Requires-Dist: cmaes==0.12.0
80
+ Requires-Dist: flaml==2.3.6
81
+ Requires-Dist: pyro-api==0.1.2
82
+ Requires-Dist: pyro-ppl==1.9.1
83
+ Requires-Dist: fsspec==2025.10.0
84
+ Requires-Dist: filelock==3.19.1
85
+ Requires-Dist: diskcache==5.6.3
86
+ Requires-Dist: platformdirs==4.4.0
87
+ Requires-Dist: distro==1.9.0
88
+ Requires-Dist: pydantic==2.12.4
89
+ Requires-Dist: pydantic-core==2.41.5
90
+ Requires-Dist: typing-extensions==4.15.0
91
+ Requires-Dist: typing-inspection==0.4.2
92
+ Requires-Dist: annotated-types==0.7.0
93
+ Requires-Dist: pytest==8.4.2
94
+ Requires-Dist: pytest-cov==7.0.0
95
+ Requires-Dist: coverage==7.10.7
96
+ Requires-Dist: joblib==1.5.2
97
+ Requires-Dist: threadpoolctl==3.6.0
98
+ Requires-Dist: networkx==3.2.1
99
+ Requires-Dist: einops==0.8.1
100
+ Requires-Dist: triton==3.4.0
101
+ Requires-Dist: safetensors==0.7.0
102
+ Requires-Dist: ml-dtypes==0.5.3
103
+ Requires-Dist: tenacity==9.1.2
104
+ Requires-Dist: xxhash==3.6.0
105
+ Requires-Dist: xlsxwriter==3.2.9
106
+ Requires-Dist: aislib==0.1.14a0
107
+ Requires-Dist: swanlab==0.7.6
108
+ Provides-Extra: cuda
109
+ Requires-Dist: nvidia-cublas-cu12==12.8.4.1; extra == "cuda"
110
+ Requires-Dist: nvidia-cuda-cupti-cu12==12.8.90; extra == "cuda"
111
+ Requires-Dist: nvidia-cuda-nvrtc-cu12==12.8.93; extra == "cuda"
112
+ Requires-Dist: nvidia-cuda-runtime-cu12==12.8.90; extra == "cuda"
113
+ Requires-Dist: nvidia-cudnn-cu12==9.10.2.21; extra == "cuda"
114
+ Requires-Dist: nvidia-cufft-cu12==11.3.3.83; extra == "cuda"
115
+ Requires-Dist: nvidia-cufile-cu12==1.13.1.3; extra == "cuda"
116
+ Requires-Dist: nvidia-curand-cu12==10.3.9.90; extra == "cuda"
117
+ Requires-Dist: nvidia-cusolver-cu12==11.7.3.90; extra == "cuda"
118
+ Requires-Dist: nvidia-cusparse-cu12==12.5.8.93; extra == "cuda"
119
+ Requires-Dist: nvidia-cusparselt-cu12==0.7.1; extra == "cuda"
120
+ Requires-Dist: nvidia-ml-py==13.580.82; extra == "cuda"
121
+ Requires-Dist: nvidia-nccl-cu12==2.27.3; extra == "cuda"
122
+ Requires-Dist: nvidia-nvjitlink-cu12==12.8.93; extra == "cuda"
123
+ Requires-Dist: nvidia-nvtx-cu12==12.8.90; extra == "cuda"
124
+ Provides-Extra: extra
125
+ Requires-Dist: autogen-agentchat==0.2.40; extra == "extra"
126
+ Requires-Dist: swanlab==0.7.6; extra == "extra"
127
+ Requires-Dist: docker==7.1.0; extra == "extra"
128
+ Requires-Dist: boto3==1.40.69; extra == "extra"
129
+ Requires-Dist: botocore==1.40.69; extra == "extra"
130
+ Requires-Dist: s3transfer==0.14.0; extra == "extra"
131
+ Provides-Extra: dev
132
+ Requires-Dist: black>=22.0.0; extra == "dev"
133
+ Requires-Dist: flake8>=4.0.0; extra == "dev"
134
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
135
+
136
+ # GPBench
137
+
138
+ GPBench is a benchmarking toolkit for genomic prediction. This repository reimplements and integrates many commonly used methods, including classic linear statistical approaches and machine learning / deep learning methods: rrBLUP, GBLUP, BayesA/B/C, SVR, Random Forest, XGBoost, LightGBM, DeepGS, DL_GWAS, G2PDeep, MVP, DNNGP, SoyDNGP, DeepCCR, EIR, Cropformer, GEFormer, CropARNet, etc.
139
+
140
+ Project Website: [https://www.sdu-idea.cn/GPBench/](https://www.sdu-idea.cn/GPBench/)
141
+
142
+ ![GPBench overview](data/fig/fig1.png)
143
+
144
+ ## Key Features
145
+ - Implements multiple genomic prediction methods and reproducible experimental workflows
146
+ - Supports GPU-accelerated deep learning methods (using PyTorch)
147
+ - Unified data loading and 10-fold cross-validation pipeline
148
+ - Outputs standardized evaluation metrics (PCC, MAE, MSE, R2) and per-fold predictions
149
+ - **LLM-powered analysis tool** (`gp_agent_tool`): Analyzes dataset characteristics, finds similar datasets, and recommends suitable genomic prediction methods based on historical experimental experience
150
+
151
+ ## Important Structure
152
+ - `data/`: Example/real dataset directory, each species/dataset is a subfolder (e.g., `data/Cotton/`), containing:
153
+ - `genotype.npz`: genotype matrix (typically saved as a NumPy array)
154
+ - `phenotype.npz`: phenotype data (contains phenotype matrix and phenotype names)
155
+ - `method_reg/`: subdirectories with implementations for each method (each method usually contains a main runner script plus hyperparameter/utility scripts)
156
+ - `result/`: default output directory for experimental results
157
+ - `gp_agent_tool/`: LLM-powered dataset analysis and method recommendation tool (see [Dataset Analysis Tool](#dataset-analysis-tool-gp_agent_tool) section)
158
+ - `environment.yml`: dependency file for creating a conda environment (recommended)
159
+
160
+ ## Environment Setup (recommended: conda)
161
+ There is an `environment.yml` in the repository; it is recommended to create and activate a conda environment with it:
162
+
163
+ ```bash
164
+ # On a machine with conda:
165
+ conda env create -f environment.yml
166
+ conda activate Benchmark
167
+ ```
168
+
169
+ Notes:
170
+ - `environment.yml` contains most dependencies (including CUDA / cuDNN related packages and pip list) and is suitable for GPU-enabled environments (the file references CUDA 11.8 and matching RAPIDS/torch/cupy versions).
171
+ - Ensure the target machine has an NVIDIA driver compatible with CUDA 11.8/12.
172
+ - If you cannot use the environment file directly, you can install main dependencies into an existing Python environment as needed:
173
+
174
+ ```bash
175
+ pip install -U numpy pandas scikit-learn torch torchvision optuna psutil xgboost lightgbm
176
+ ```
177
+
178
+ (Warning: the above is a simplified installation; some packages may need additional configuration on GPU systems or certain platforms.)
179
+
180
+ ## Data Format and Preparation
181
+ - Each species folder should contain `genotype.npz` and `phenotype.npz`.
182
+ - `genotype.npz` usually stores a 2D array (number of samples × number of SNPs).
183
+ - `phenotype.npz` typically includes two arrays: the phenotype matrix (number of samples × number of phenotypes) and a list of phenotype names.
184
+
185
+ Quickly view phenotype names for a dataset (e.g., `Cotton`):
186
+
187
+ ```bash
188
+ python - <<'PY'
189
+ import numpy as np
190
+ obj = np.load('data/Cotton/phenotype.npz')
191
+ print(obj['arr_1'])
192
+ PY
193
+ ```
194
+
195
+ ## Quick Start (example with a method)
196
+ Most methods have a main script under `method_reg/<Method>/`. Scripts usually accept parameters like `--methods`, `--species`, `--phe`, `--data_dir`, `--result_dir`, etc. Example:
197
+
198
+ ```bash
199
+ # 1) Activate the environment
200
+ conda activate Benchmark
201
+
202
+ # 2) Run a single phenotype with DeepCCR (note: include trailing slash after --species)
203
+ python method_reg/DeepCCR/DeepCCR.py \
204
+ --methods DeepCCR/ \
205
+ --species Cotton/ \
206
+ --phe FibLen_17_18 \
207
+ --data_dir data/ \
208
+ --result_dir result/
209
+ ```
210
+
211
+ Common optional arguments (may vary across scripts):
212
+ - `--epoch`: number of training epochs (example scripts often default to 1000)
213
+ - `--batch_size`: batch size
214
+ - `--lr`: learning rate
215
+ - `--patience`: early stopping patience
216
+
217
+ You can inspect the argparse help for the specific script in the method directory:
218
+
219
+ ```bash
220
+ python method_reg/DeepCCR/DeepCCR.py -h
221
+ ```
222
+
223
+ ## Dataset Analysis Tool (gp_agent_tool)
224
+
225
+ The `gp_agent_tool` is an LLM-powered analysis tool that performs comprehensive dataset analysis and automatically recommends suitable genomic prediction methods. It analyzes your dataset characteristics, computes statistical features, finds similar datasets from historical experiments, and provides evidence-based method recommendations.
226
+
227
+ ### Features
228
+ - **Dataset statistical analysis**: Automatically computes and analyzes dataset statistics including sample size, marker count, phenotype distribution, missing rates, and statistical properties
229
+ - **Similar dataset discovery**: Finds datasets with similar statistical distributions to your query dataset from historical experimental databases
230
+ - **Method recommendation**: Recommends genomic prediction methods that have shown best performance on similar datasets based on historical experience
231
+ - **Bilingual support**: Supports both Chinese and English queries and analysis
232
+ - **Experience-based insights**: Leverages comprehensive historical experimental results to provide evidence-based analysis and recommendations
233
+
234
+ ### Prerequisites
235
+
236
+ 1. **LLM Configuration**: Create a configuration file at `gp_agent_tool/config/config.json` with your LLM API settings:
237
+
238
+ ```json
239
+ {
240
+ "llm": {
241
+ "model": "gpt-4o-mini",
242
+ "api_key": "YOUR_OPENAI_API_KEY",
243
+ "base_url": "https://api.openai.com/v1",
244
+ "timeout_seconds": 60,
245
+ "max_retries": 3
246
+ },
247
+ "codegen_llm": {
248
+ "model": "gpt-4o-mini",
249
+ "api_key": "YOUR_OPENAI_API_KEY",
250
+ "base_url": "https://api.openai.com/v1",
251
+ "timeout_seconds": 60,
252
+ "max_retries": 3
253
+ },
254
+ "multimodal_llm": {
255
+ "model": "qwen-vl-max",
256
+ "api_key": "YOUR_DASHSCOPE_API_KEY"
257
+ }
258
+ }
259
+ ```
260
+
261
+ **Important**: Please replace the `api_key` fields in the configuration file with your own API keys:
262
+ - Replace `YOUR_OPENAI_API_KEY` in `llm` and `codegen_llm` with your OpenAI API key
263
+ - Replace `YOUR_DASHSCOPE_API_KEY` in `multimodal_llm` with your Alibaba Cloud DashScope API key
264
+
265
+ You can obtain API keys from the following URLs:
266
+ - OpenAI API key: https://platform.openai.com/api-keys
267
+ - Alibaba Cloud DashScope API key: https://dashscope.console.aliyun.com/apiKey
268
+
269
+ 2. **Additional Dependencies**: Install required packages for the tool:
270
+
271
+ ```bash
272
+ pip install langchain langgraph openai
273
+ ```
274
+
275
+ ### Usage
276
+
277
+ #### Basic Usage
278
+
279
+ Run the tool from the project root directory:
280
+
281
+ ```bash
282
+ cd gp_agent_tool
283
+ python main.py \
284
+ -q "Based on existing models, summarize the patterns in the mkg trait of cattle." \
285
+ -o result.json
286
+ ```
287
+
288
+ Or in English:
289
+
290
+ ```bash
291
+ python main.py \
292
+ -d ../data/Rapeseed \
293
+ -q "Recommend the best methods for this dataset" \
294
+ -o result.json
295
+ ```
296
+
297
+ #### Command-line Arguments
298
+
299
+ - **`-d / --dataset`** (optional): Path to the dataset directory containing `genotype.npz` and `phenotype.npz`. The tool will analyze this dataset to compute statistical features. If not provided, analysis and recommendations are based on the complete experience table only.
300
+ - **`-q / --user-query`** (required): Your analysis requirement or question description (supports both Chinese and English). Examples: "分析这个数据集的特征" / "Analyze this dataset and recommend methods" / "What methods work best for binary phenotypes?"
301
+ - **`-m / --mask`** (optional): Specify a `species/phenotype` (e.g., `Rapeseed/FloweringTime`) to mask in the reference experience database, preventing "answer leakage" when evaluating on known datasets.
302
+ - **`-o / --output`** (optional): Path to save the analysis result as a JSON file. If not provided, results are printed to the terminal.
303
+
304
+ #### Dataset Analysis Features
305
+
306
+ When a dataset path is provided, the tool automatically computes the following statistical features:
307
+
308
+ - **Sample information**: Total samples, valid samples, missing rate
309
+ - **Marker information**: Number of markers, genotype statistics (mean, std, missing rate, MAF)
310
+ - **Phenotype statistics**: Mean, std, min, max, median, skewness, kurtosis
311
+ - **Data type information**: Genotype and phenotype data types, binary phenotype detection
312
+
313
+ #### Example Output
314
+
315
+ The tool returns a JSON object with two main sections:
316
+
317
+ ```json
318
+ {
319
+ "similar_datasets": {
320
+ "items": ["Chickpea/Days_to_0.5_flowering", "Cotton/FibLen_17_18"],
321
+ "reason": "These datasets have similar statistical distributions..."
322
+ },
323
+ "methods": {
324
+ "items": ["GBLUP", "XGBoost", "LightGBM"],
325
+ "reason": "Based on historical experience, these methods showed best performance on similar datasets..."
326
+ }
327
+ }
328
+ ```
329
+
330
+ #### Analysis Workflow
331
+
332
+ When you provide a dataset path, the tool performs the following analysis steps:
333
+
334
+ 1. **Dataset feature extraction**: Computes statistical features from your dataset (phenotype mean, std, skewness, kurtosis, sample size, marker count, etc.)
335
+ 2. **Similar dataset matching**: Compares your dataset features with historical datasets to find the most similar ones
336
+ 3. **Experience table filtering**: Filters the historical experience table to include only results from similar datasets
337
+ 4. **Method analysis and recommendation**: Analyzes which methods performed best on similar datasets and recommends them with detailed reasoning
338
+
339
+ #### Use Cases
340
+
341
+ 1. **General method query**: Query methods based on specific criteria without providing a dataset:
342
+
343
+ ```bash
344
+ python main.py \
345
+ -q "What methods work best for small sample sizes?" \
346
+ -o result.json
347
+ ```
348
+
349
+ 2. **Evaluation mode with masking**: When evaluating on a known dataset, mask it to avoid bias in the analysis:
350
+
351
+ ```bash
352
+ python main.py \
353
+ -d ../data/Rapeseed \
354
+ -q "Analyze this dataset and recommend appropriate algorithms." \
355
+ -m Rapeseed/FloweringTime \
356
+ -o result.json
357
+ ```
358
+
359
+ ## Output Description
360
+ - Each method run creates a directory under `result/` named by method/species/phenotype, e.g., `result/DeepCCR/Cotton/<PHENO>/`.
361
+ - Per-fold prediction results are typically saved as `fold{n}.csv`, containing `Y_test` and `Y_pred` columns.
362
+ - The script prints or saves average evaluation metrics at the end: PCC (Pearson correlation coefficient), MAE, MSE, R2, along with runtime and memory/GPU usage.
363
+
364
+ ## Full Dataset Link
365
+ - [Species dataset](https://doi.org/10.6084/m9.figshare.31007608): contains genotype and phenotype data for 16 species.
366
+
367
+ ## Running Tips & Troubleshooting
368
+ - For GPU usage, ensure `conda activate Benchmark` and that CUDA drivers are available; `torch.cuda.is_available()` should return True.
369
+ - If you encounter memory or GPU OOM issues, try reducing `--batch_size` or disabling some parallel settings in scripts.
370
+ - If running on CPU-only systems, some GPU-specific methods (RAPIDS or GPU-only implementations) may be unavailable or require alternative implementations.
371
+
372
+ ## Contributing & Contact
373
+ - Contributions via issues and PRs are welcome. Please describe changes and testing in PRs.
374
+ - Contact: open an Issue in the repository or reach the repository owner (GitHub user: `xwzhang2118`).
375
+
376
+
377
+
378
+
379
+