gpbench 1.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. gpbench-1.0.0/PKG-INFO +379 -0
  2. gpbench-1.0.0/README.md +244 -0
  3. gpbench-1.0.0/gp_agent_tool/compute_dataset_feature.py +67 -0
  4. gpbench-1.0.0/gp_agent_tool/config.py +65 -0
  5. gpbench-1.0.0/gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  6. gpbench-1.0.0/gp_agent_tool/experience/dataset_summary_info.py +13 -0
  7. gpbench-1.0.0/gp_agent_tool/experience/experience_info.py +12 -0
  8. gpbench-1.0.0/gp_agent_tool/experience/get_matched_experience.py +111 -0
  9. gpbench-1.0.0/gp_agent_tool/llm_client.py +119 -0
  10. gpbench-1.0.0/gp_agent_tool/logging_utils.py +24 -0
  11. gpbench-1.0.0/gp_agent_tool/main.py +347 -0
  12. gpbench-1.0.0/gp_agent_tool/read_agent/__init__.py +46 -0
  13. gpbench-1.0.0/gp_agent_tool/read_agent/nodes.py +674 -0
  14. gpbench-1.0.0/gp_agent_tool/read_agent/prompts.py +547 -0
  15. gpbench-1.0.0/gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  16. gpbench-1.0.0/gp_agent_tool/read_agent/state.py +101 -0
  17. gpbench-1.0.0/gp_agent_tool/read_agent/workflow.py +54 -0
  18. gpbench-1.0.0/gpbench/__init__.py +25 -0
  19. gpbench-1.0.0/gpbench/_selftest.py +104 -0
  20. gpbench-1.0.0/gpbench/method_class/BayesA/BayesA_class.py +141 -0
  21. gpbench-1.0.0/gpbench/method_class/BayesA/__init__.py +5 -0
  22. gpbench-1.0.0/gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  23. gpbench-1.0.0/gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  24. gpbench-1.0.0/gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  25. gpbench-1.0.0/gpbench/method_class/BayesB/BayesB_class.py +140 -0
  26. gpbench-1.0.0/gpbench/method_class/BayesB/__init__.py +5 -0
  27. gpbench-1.0.0/gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  28. gpbench-1.0.0/gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  29. gpbench-1.0.0/gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  30. gpbench-1.0.0/gpbench/method_class/BayesC/BayesC_class.py +141 -0
  31. gpbench-1.0.0/gpbench/method_class/BayesC/__init__.py +4 -0
  32. gpbench-1.0.0/gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  33. gpbench-1.0.0/gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  34. gpbench-1.0.0/gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  35. gpbench-1.0.0/gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  36. gpbench-1.0.0/gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  37. gpbench-1.0.0/gpbench/method_class/CropARNet/__init__.py +5 -0
  38. gpbench-1.0.0/gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  39. gpbench-1.0.0/gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  40. gpbench-1.0.0/gpbench/method_class/Cropformer/__init__.py +5 -0
  41. gpbench-1.0.0/gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  42. gpbench-1.0.0/gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  43. gpbench-1.0.0/gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  44. gpbench-1.0.0/gpbench/method_class/DL_GWAS/__init__.py +5 -0
  45. gpbench-1.0.0/gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  46. gpbench-1.0.0/gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  47. gpbench-1.0.0/gpbench/method_class/DNNGP/__init__.py +5 -0
  48. gpbench-1.0.0/gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  49. gpbench-1.0.0/gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  50. gpbench-1.0.0/gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  51. gpbench-1.0.0/gpbench/method_class/DeepCCR/__init__.py +5 -0
  52. gpbench-1.0.0/gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  53. gpbench-1.0.0/gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  54. gpbench-1.0.0/gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  55. gpbench-1.0.0/gpbench/method_class/DeepGS/__init__.py +5 -0
  56. gpbench-1.0.0/gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  57. gpbench-1.0.0/gpbench/method_class/EIR/EIR_class.py +276 -0
  58. gpbench-1.0.0/gpbench/method_class/EIR/EIR_he_class.py +184 -0
  59. gpbench-1.0.0/gpbench/method_class/EIR/__init__.py +5 -0
  60. gpbench-1.0.0/gpbench/method_class/EIR/utils/__init__.py +0 -0
  61. gpbench-1.0.0/gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  62. gpbench-1.0.0/gpbench/method_class/EIR/utils/common.py +65 -0
  63. gpbench-1.0.0/gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  64. gpbench-1.0.0/gpbench/method_class/EIR/utils/logging.py +59 -0
  65. gpbench-1.0.0/gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  66. gpbench-1.0.0/gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  67. gpbench-1.0.0/gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  68. gpbench-1.0.0/gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  69. gpbench-1.0.0/gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  70. gpbench-1.0.0/gpbench/method_class/ElasticNet/__init__.py +5 -0
  71. gpbench-1.0.0/gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  72. gpbench-1.0.0/gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  73. gpbench-1.0.0/gpbench/method_class/G2PDeep/__init__.py +5 -0
  74. gpbench-1.0.0/gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  75. gpbench-1.0.0/gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  76. gpbench-1.0.0/gpbench/method_class/GBLUP/__init__.py +5 -0
  77. gpbench-1.0.0/gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  78. gpbench-1.0.0/gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  79. gpbench-1.0.0/gpbench/method_class/GEFormer/__init__.py +5 -0
  80. gpbench-1.0.0/gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  81. gpbench-1.0.0/gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  82. gpbench-1.0.0/gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  83. gpbench-1.0.0/gpbench/method_class/LightGBM/__init__.py +5 -0
  84. gpbench-1.0.0/gpbench/method_class/RF/RF_GPU_class.py +165 -0
  85. gpbench-1.0.0/gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  86. gpbench-1.0.0/gpbench/method_class/RF/__init__.py +5 -0
  87. gpbench-1.0.0/gpbench/method_class/SVC/SVC_GPU.py +181 -0
  88. gpbench-1.0.0/gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  89. gpbench-1.0.0/gpbench/method_class/SVC/__init__.py +5 -0
  90. gpbench-1.0.0/gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  91. gpbench-1.0.0/gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  92. gpbench-1.0.0/gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  93. gpbench-1.0.0/gpbench/method_class/SoyDNGP/__init__.py +5 -0
  94. gpbench-1.0.0/gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  95. gpbench-1.0.0/gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  96. gpbench-1.0.0/gpbench/method_class/XGBoost/__init__.py +5 -0
  97. gpbench-1.0.0/gpbench/method_class/__init__.py +52 -0
  98. gpbench-1.0.0/gpbench/method_class/rrBLUP/__init__.py +5 -0
  99. gpbench-1.0.0/gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  100. gpbench-1.0.0/gpbench/method_reg/BayesA/BayesA.py +116 -0
  101. gpbench-1.0.0/gpbench/method_reg/BayesA/__init__.py +5 -0
  102. gpbench-1.0.0/gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  103. gpbench-1.0.0/gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  104. gpbench-1.0.0/gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  105. gpbench-1.0.0/gpbench/method_reg/BayesB/BayesB.py +117 -0
  106. gpbench-1.0.0/gpbench/method_reg/BayesB/__init__.py +5 -0
  107. gpbench-1.0.0/gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  108. gpbench-1.0.0/gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  109. gpbench-1.0.0/gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  110. gpbench-1.0.0/gpbench/method_reg/BayesC/BayesC.py +115 -0
  111. gpbench-1.0.0/gpbench/method_reg/BayesC/__init__.py +5 -0
  112. gpbench-1.0.0/gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  113. gpbench-1.0.0/gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  114. gpbench-1.0.0/gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  115. gpbench-1.0.0/gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  116. gpbench-1.0.0/gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  117. gpbench-1.0.0/gpbench/method_reg/CropARNet/__init__.py +5 -0
  118. gpbench-1.0.0/gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  119. gpbench-1.0.0/gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  120. gpbench-1.0.0/gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  121. gpbench-1.0.0/gpbench/method_reg/Cropformer/__init__.py +5 -0
  122. gpbench-1.0.0/gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  123. gpbench-1.0.0/gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  124. gpbench-1.0.0/gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  125. gpbench-1.0.0/gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  126. gpbench-1.0.0/gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  127. gpbench-1.0.0/gpbench/method_reg/DNNGP/__init__.py +5 -0
  128. gpbench-1.0.0/gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  129. gpbench-1.0.0/gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  130. gpbench-1.0.0/gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  131. gpbench-1.0.0/gpbench/method_reg/DeepCCR/__init__.py +5 -0
  132. gpbench-1.0.0/gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  133. gpbench-1.0.0/gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  134. gpbench-1.0.0/gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  135. gpbench-1.0.0/gpbench/method_reg/DeepGS/__init__.py +5 -0
  136. gpbench-1.0.0/gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  137. gpbench-1.0.0/gpbench/method_reg/EIR/EIR.py +258 -0
  138. gpbench-1.0.0/gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  139. gpbench-1.0.0/gpbench/method_reg/EIR/__init__.py +5 -0
  140. gpbench-1.0.0/gpbench/method_reg/EIR/utils/__init__.py +0 -0
  141. gpbench-1.0.0/gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  142. gpbench-1.0.0/gpbench/method_reg/EIR/utils/common.py +65 -0
  143. gpbench-1.0.0/gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  144. gpbench-1.0.0/gpbench/method_reg/EIR/utils/logging.py +59 -0
  145. gpbench-1.0.0/gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  146. gpbench-1.0.0/gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  147. gpbench-1.0.0/gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  148. gpbench-1.0.0/gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  149. gpbench-1.0.0/gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  150. gpbench-1.0.0/gpbench/method_reg/ElasticNet/__init__.py +5 -0
  151. gpbench-1.0.0/gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  152. gpbench-1.0.0/gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  153. gpbench-1.0.0/gpbench/method_reg/G2PDeep/__init__.py +5 -0
  154. gpbench-1.0.0/gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  155. gpbench-1.0.0/gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  156. gpbench-1.0.0/gpbench/method_reg/GBLUP/__init__.py +5 -0
  157. gpbench-1.0.0/gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  158. gpbench-1.0.0/gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  159. gpbench-1.0.0/gpbench/method_reg/GEFormer/__init__.py +5 -0
  160. gpbench-1.0.0/gpbench/method_reg/GEFormer/gMLP.py +341 -0
  161. gpbench-1.0.0/gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  162. gpbench-1.0.0/gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  163. gpbench-1.0.0/gpbench/method_reg/LightGBM/__init__.py +5 -0
  164. gpbench-1.0.0/gpbench/method_reg/MVP/MVP.py +182 -0
  165. gpbench-1.0.0/gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  166. gpbench-1.0.0/gpbench/method_reg/MVP/__init__.py +5 -0
  167. gpbench-1.0.0/gpbench/method_reg/MVP/base_MVP.py +113 -0
  168. gpbench-1.0.0/gpbench/method_reg/RF/RF_GPU.py +174 -0
  169. gpbench-1.0.0/gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  170. gpbench-1.0.0/gpbench/method_reg/RF/__init__.py +5 -0
  171. gpbench-1.0.0/gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  172. gpbench-1.0.0/gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  173. gpbench-1.0.0/gpbench/method_reg/SVC/__init__.py +5 -0
  174. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  175. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  176. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  177. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  178. gpbench-1.0.0/gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  179. gpbench-1.0.0/gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  180. gpbench-1.0.0/gpbench/method_reg/XGBoost/__init__.py +5 -0
  181. gpbench-1.0.0/gpbench/method_reg/__init__.py +55 -0
  182. gpbench-1.0.0/gpbench/method_reg/rrBLUP/__init__.py +5 -0
  183. gpbench-1.0.0/gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  184. gpbench-1.0.0/gpbench.egg-info/PKG-INFO +379 -0
  185. gpbench-1.0.0/gpbench.egg-info/SOURCES.txt +374 -0
  186. gpbench-1.0.0/gpbench.egg-info/dependency_links.txt +1 -0
  187. gpbench-1.0.0/gpbench.egg-info/entry_points.txt +2 -0
  188. gpbench-1.0.0/gpbench.egg-info/requires.txt +120 -0
  189. gpbench-1.0.0/gpbench.egg-info/top_level.txt +6 -0
  190. gpbench-1.0.0/pyproject.toml +186 -0
  191. gpbench-1.0.0/setup.cfg +4 -0
  192. gpbench-1.0.0/tests/test_import.py +80 -0
  193. gpbench-1.0.0/tests/test_method.py +232 -0
gpbench-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,379 @@
+ Metadata-Version: 2.4
+ Name: gpbench
+ Version: 1.0.0
+ Summary: A benchmarking toolkit for genomic prediction with multiple methods and LLM-powered analysis
+ Author: GPBench Contributors
+ License: MIT
+ Keywords: genomic prediction,bioinformatics,machine learning,deep learning
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: numpy==1.26.4
+ Requires-Dist: pandas<2.2.3,>=2.0
+ Requires-Dist: scipy==1.13.1
+ Requires-Dist: matplotlib==3.9.4
+ Requires-Dist: seaborn==0.13.2
+ Requires-Dist: scikit-learn==1.6.1
+ Requires-Dist: torch==2.8.0
+ Requires-Dist: torchvision==0.23.0
+ Requires-Dist: torchmetrics==1.8.2
+ Requires-Dist: pytorch-lightning==2.5.6
+ Requires-Dist: lightning==2.5.6
+ Requires-Dist: lightning-utilities==0.15.2
+ Requires-Dist: tensorflow==2.20.0
+ Requires-Dist: keras==3.10.0
+ Requires-Dist: tensorboard==2.20.0
+ Requires-Dist: tensorboard-data-server==0.7.2
+ Requires-Dist: xgboost==2.1.4
+ Requires-Dist: lightgbm==4.6.0
+ Requires-Dist: optuna==2.10.0
+ Requires-Dist: umap-learn==0.5.9.post2
+ Requires-Dist: pynndescent==0.5.13
+ Requires-Dist: langchain-core==0.3.82
+ Requires-Dist: langchain-openai==0.3.35
+ Requires-Dist: langgraph==0.6.11
+ Requires-Dist: langgraph-checkpoint==2.1.2
+ Requires-Dist: langgraph-prebuilt==0.6.5
+ Requires-Dist: langgraph-sdk==0.2.9
+ Requires-Dist: langsmith==0.4.37
+ Requires-Dist: openai==2.8.1
+ Requires-Dist: dashscope==1.25.7
+ Requires-Dist: tiktoken==0.12.0
+ Requires-Dist: transformers==4.57.3
+ Requires-Dist: tokenizers==0.22.1
+ Requires-Dist: huggingface-hub==0.36.0
+ Requires-Dist: psutil==7.1.3
+ Requires-Dist: tqdm==4.67.1
+ Requires-Dist: pyyaml==6.0.3
+ Requires-Dist: requests==2.32.5
+ Requires-Dist: requests-toolbelt==1.0.0
+ Requires-Dist: python-dotenv==1.2.1
+ Requires-Dist: rich==13.9.4
+ Requires-Dist: rich-argparse==1.7.2
+ Requires-Dist: pyecharts==2.0.9
+ Requires-Dist: h5py==3.14.0
+ Requires-Dist: pandas-plink==2.2.9
+ Requires-Dist: xarray==2024.7.0
+ Requires-Dist: statsmodels==0.14.5
+ Requires-Dist: patsy==1.0.2
+ Requires-Dist: rpy2==3.5.16
+ Requires-Dist: aiohttp==3.13.2
+ Requires-Dist: httpx==0.28.1
+ Requires-Dist: httpcore==1.0.9
+ Requires-Dist: anyio==4.12.0
+ Requires-Dist: orjson==3.11.5
+ Requires-Dist: ormsgpack==1.11.0
+ Requires-Dist: simplejson==3.20.2
+ Requires-Dist: protobuf==6.33.0
+ Requires-Dist: flatbuffers==25.9.23
+ Requires-Dist: sympy==1.14.0
+ Requires-Dist: mpmath==1.3.0
+ Requires-Dist: opt-einsum==3.4.0
+ Requires-Dist: cmaes==0.12.0
+ Requires-Dist: flaml==2.3.6
+ Requires-Dist: pyro-api==0.1.2
+ Requires-Dist: pyro-ppl==1.9.1
+ Requires-Dist: fsspec==2025.10.0
+ Requires-Dist: filelock==3.19.1
+ Requires-Dist: diskcache==5.6.3
+ Requires-Dist: platformdirs==4.4.0
+ Requires-Dist: distro==1.9.0
+ Requires-Dist: pydantic==2.12.4
+ Requires-Dist: pydantic-core==2.41.5
+ Requires-Dist: typing-extensions==4.15.0
+ Requires-Dist: typing-inspection==0.4.2
+ Requires-Dist: annotated-types==0.7.0
+ Requires-Dist: pytest==8.4.2
+ Requires-Dist: pytest-cov==7.0.0
+ Requires-Dist: coverage==7.10.7
+ Requires-Dist: joblib==1.5.2
+ Requires-Dist: threadpoolctl==3.6.0
+ Requires-Dist: networkx==3.2.1
+ Requires-Dist: einops==0.8.1
+ Requires-Dist: triton==3.4.0
+ Requires-Dist: safetensors==0.7.0
+ Requires-Dist: ml-dtypes==0.5.3
+ Requires-Dist: tenacity==9.1.2
+ Requires-Dist: xxhash==3.6.0
+ Requires-Dist: xlsxwriter==3.2.9
+ Requires-Dist: aislib==0.1.14a0
+ Requires-Dist: swanlab==0.7.6
+ Provides-Extra: cuda
+ Requires-Dist: nvidia-cublas-cu12==12.8.4.1; extra == "cuda"
+ Requires-Dist: nvidia-cuda-cupti-cu12==12.8.90; extra == "cuda"
+ Requires-Dist: nvidia-cuda-nvrtc-cu12==12.8.93; extra == "cuda"
+ Requires-Dist: nvidia-cuda-runtime-cu12==12.8.90; extra == "cuda"
+ Requires-Dist: nvidia-cudnn-cu12==9.10.2.21; extra == "cuda"
+ Requires-Dist: nvidia-cufft-cu12==11.3.3.83; extra == "cuda"
+ Requires-Dist: nvidia-cufile-cu12==1.13.1.3; extra == "cuda"
+ Requires-Dist: nvidia-curand-cu12==10.3.9.90; extra == "cuda"
+ Requires-Dist: nvidia-cusolver-cu12==11.7.3.90; extra == "cuda"
+ Requires-Dist: nvidia-cusparse-cu12==12.5.8.93; extra == "cuda"
+ Requires-Dist: nvidia-cusparselt-cu12==0.7.1; extra == "cuda"
+ Requires-Dist: nvidia-ml-py==13.580.82; extra == "cuda"
+ Requires-Dist: nvidia-nccl-cu12==2.27.3; extra == "cuda"
+ Requires-Dist: nvidia-nvjitlink-cu12==12.8.93; extra == "cuda"
+ Requires-Dist: nvidia-nvtx-cu12==12.8.90; extra == "cuda"
+ Provides-Extra: extra
+ Requires-Dist: autogen-agentchat==0.2.40; extra == "extra"
+ Requires-Dist: swanlab==0.7.6; extra == "extra"
+ Requires-Dist: docker==7.1.0; extra == "extra"
+ Requires-Dist: boto3==1.40.69; extra == "extra"
+ Requires-Dist: botocore==1.40.69; extra == "extra"
+ Requires-Dist: s3transfer==0.14.0; extra == "extra"
+ Provides-Extra: dev
+ Requires-Dist: black>=22.0.0; extra == "dev"
+ Requires-Dist: flake8>=4.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+
+ # GPBench
+
+ GPBench is a benchmarking toolkit for genomic prediction. It reimplements and integrates many commonly used methods, from classic linear statistical approaches to machine learning and deep learning methods: rrBLUP, GBLUP, BayesA/B/C, SVR, Random Forest, XGBoost, LightGBM, DeepGS, DL_GWAS, G2PDeep, MVP, DNNGP, SoyDNGP, DeepCCR, EIR, Cropformer, GEFormer, CropARNet, etc.
+
+ Project Website: [https://www.sdu-idea.cn/GPBench/](https://www.sdu-idea.cn/GPBench/)
+
+ ![GPBench overview](data/fig/fig1.png)
+
+ ## Key Features
+ - Implements multiple genomic prediction methods and reproducible experimental workflows
+ - Supports GPU-accelerated deep learning methods (using PyTorch)
+ - Unified data loading and 10-fold cross-validation pipeline
+ - Outputs standardized evaluation metrics (PCC, MAE, MSE, R2) and per-fold predictions
+ - **LLM-powered analysis tool** (`gp_agent_tool`): analyzes dataset characteristics, finds similar datasets, and recommends suitable genomic prediction methods based on historical experimental experience
+
+ ## Important Structure
+ - `data/`: example/real dataset directory; each species/dataset is a subfolder (e.g., `data/Cotton/`) containing:
+   - `genotype.npz`: genotype matrix (typically saved as a NumPy array)
+   - `phenotype.npz`: phenotype data (contains the phenotype matrix and phenotype names)
+ - `method_reg/`: one subdirectory per method (each usually contains a main runner script plus hyperparameter/utility scripts)
+ - `result/`: default output directory for experimental results
+ - `gp_agent_tool/`: LLM-powered dataset analysis and method recommendation tool (see the [Dataset Analysis Tool](#dataset-analysis-tool-gp_agent_tool) section)
+ - `environment.yml`: dependency file for creating a conda environment (recommended)
+
+ ## Environment Setup (recommended: conda)
+ The repository ships an `environment.yml`; it is recommended to create and activate a conda environment with it:
+
+ ```bash
+ # On a machine with conda:
+ conda env create -f environment.yml
+ conda activate Benchmark
+ ```
+
+ Notes:
+ - `environment.yml` contains most dependencies (including CUDA/cuDNN-related packages and a pip section) and is suitable for GPU-enabled environments (the file references CUDA 11.8 and matching RAPIDS/torch/cupy versions).
+ - Ensure the target machine has an NVIDIA driver compatible with CUDA 11.8/12.
+ - If you cannot use the environment file directly, you can install the main dependencies into an existing Python environment as needed:
+
+ ```bash
+ pip install -U numpy pandas scikit-learn torch torchvision optuna psutil xgboost lightgbm
+ ```
+
+ (Warning: this is a simplified installation; some packages may need additional configuration on GPU systems or certain platforms.)
+
+ ## Data Format and Preparation
+ - Each species folder should contain `genotype.npz` and `phenotype.npz`.
+ - `genotype.npz` usually stores a 2D array (number of samples × number of SNPs).
+ - `phenotype.npz` typically includes two arrays: the phenotype matrix (number of samples × number of phenotypes) and a list of phenotype names.
+
+ Quickly view the phenotype names for a dataset (e.g., `Cotton`):
+
+ ```bash
+ python - <<'PY'
+ import numpy as np
+ obj = np.load('data/Cotton/phenotype.npz')
+ print(obj['arr_1'])
+ PY
+ ```
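For orientation, here is a minimal loading sketch in plain Python. It assumes the `arr_0`/`arr_1` key layout used in the snippet above; key names can vary by dataset:

```python
import numpy as np

# Load one dataset folder (paths follow the layout described above).
geno = np.load('data/Cotton/genotype.npz')['arr_0']   # samples x SNPs
pheno_file = np.load('data/Cotton/phenotype.npz')
pheno = pheno_file['arr_0']                           # samples x phenotypes
names = pheno_file['arr_1']                           # phenotype names

print(geno.shape, pheno.shape, list(names))
# Genotype and phenotype rows are expected to be aligned by sample:
assert geno.shape[0] == pheno.shape[0]
```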
+
+ ## Quick Start (example with a method)
+ Most methods have a main script under `method_reg/<Method>/`. Scripts usually accept parameters such as `--methods`, `--species`, `--phe`, `--data_dir`, `--result_dir`, etc. Example:
+
+ ```bash
+ # 1) Activate the environment
+ conda activate Benchmark
+
+ # 2) Run a single phenotype with DeepCCR (note: include the trailing slash after --species)
+ python method_reg/DeepCCR/DeepCCR.py \
+   --methods DeepCCR/ \
+   --species Cotton/ \
+   --phe FibLen_17_18 \
+   --data_dir data/ \
+   --result_dir result/
+ ```
+
+ Common optional arguments (may vary across scripts):
+ - `--epoch`: number of training epochs (example scripts often default to 1000)
+ - `--batch_size`: batch size
+ - `--lr`: learning rate
+ - `--patience`: early stopping patience
+
+ You can inspect the argparse help for a specific script in its method directory:
+
+ ```bash
+ python method_reg/DeepCCR/DeepCCR.py -h
+ ```
+
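Since every runner shares the same CLI shape, looping over phenotypes is straightforward. A hedged sketch, where the phenotype list and paths are illustrative:

```python
import subprocess

# Hypothetical batch driver: run one method across several phenotypes.
phenotypes = ["FibLen_17_18"]  # extend with names read from phenotype.npz ('arr_1')
for phe in phenotypes:
    subprocess.run(
        ["python", "method_reg/DeepCCR/DeepCCR.py",
         "--methods", "DeepCCR/", "--species", "Cotton/",
         "--phe", phe, "--data_dir", "data/", "--result_dir", "result/"],
        check=True,  # stop on the first failing run
    )
```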
+ ## Dataset Analysis Tool (gp_agent_tool)
+
+ `gp_agent_tool` is an LLM-powered analysis tool that performs comprehensive dataset analysis and automatically recommends suitable genomic prediction methods. It analyzes your dataset's characteristics, computes statistical features, finds similar datasets from historical experiments, and provides evidence-based method recommendations.
+
+ ### Features
+ - **Dataset statistical analysis**: automatically computes and analyzes dataset statistics including sample size, marker count, phenotype distribution, missing rates, and statistical properties
+ - **Similar dataset discovery**: finds datasets with statistical distributions similar to your query dataset in historical experimental databases
+ - **Method recommendation**: recommends genomic prediction methods that have shown the best performance on similar datasets, based on historical experience
+ - **Bilingual support**: supports both Chinese and English queries and analysis
+ - **Experience-based insights**: leverages comprehensive historical experimental results to provide evidence-based analysis and recommendations
+
+ ### Prerequisites
+
+ 1. **LLM Configuration**: Create a configuration file at `gp_agent_tool/config/config.json` with your LLM API settings (a quick sanity-check sketch follows this list):
+
+ ```json
+ {
+   "llm": {
+     "model": "gpt-4o-mini",
+     "api_key": "YOUR_OPENAI_API_KEY",
+     "base_url": "https://api.openai.com/v1",
+     "timeout_seconds": 60,
+     "max_retries": 3
+   },
+   "codegen_llm": {
+     "model": "gpt-4o-mini",
+     "api_key": "YOUR_OPENAI_API_KEY",
+     "base_url": "https://api.openai.com/v1",
+     "timeout_seconds": 60,
+     "max_retries": 3
+   },
+   "multimodal_llm": {
+     "model": "qwen-vl-max",
+     "api_key": "YOUR_DASHSCOPE_API_KEY"
+   }
+ }
+ ```
+
+ **Important**: replace the `api_key` fields in the configuration file with your own API keys:
+ - Replace `YOUR_OPENAI_API_KEY` in `llm` and `codegen_llm` with your OpenAI API key
+ - Replace `YOUR_DASHSCOPE_API_KEY` in `multimodal_llm` with your Alibaba Cloud DashScope API key
+
+ You can obtain API keys from the following URLs:
+ - OpenAI API key: https://platform.openai.com/api-keys
+ - Alibaba Cloud DashScope API key: https://dashscope.console.aliyun.com/apiKey
+
+ 2. **Additional Dependencies**: Install the required packages for the tool:
+
+ ```bash
+ pip install langchain langgraph openai
+ ```
+
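The sanity-check sketch referenced above; it only validates the documented config schema and is not part of the package:

```python
import json
from pathlib import Path

cfg = json.loads(Path("gp_agent_tool/config/config.json").read_text())

# The documented config has three sections; warn about leftover placeholders.
for section in ("llm", "codegen_llm", "multimodal_llm"):
    key = cfg.get(section, {}).get("api_key", "")
    if not key or key.startswith("YOUR_"):
        print(f"{section}: api_key still looks like a placeholder")
    else:
        print(f"{section}: model={cfg[section].get('model')}")
```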
275
+ ### Usage
276
+
277
+ #### Basic Usage
278
+
279
+ Run the tool from the project root directory:
280
+
281
+ ```bash
282
+ cd gp_agent_tool
283
+ python main.py \
284
+ -q "Based on existing models, summarize the patterns in the mkg trait of cattle." \
285
+ -o result.json
286
+ ```
287
+
288
+ Or in English:
289
+
290
+ ```bash
291
+ python main.py \
292
+ -d ../data/Rapeseed \
293
+ -q "Recommend the best methods for this dataset" \
294
+ -o result.json
295
+ ```
296
+
297
+ #### Command-line Arguments
298
+
299
+ - **`-d / --dataset`** (optional): Path to the dataset directory containing `genotype.npz` and `phenotype.npz`. The tool will analyze this dataset to compute statistical features. If not provided, analysis and recommendations are based on the complete experience table only.
300
+ - **`-q / --user-query`** (required): Your analysis requirement or question description (supports both Chinese and English). Examples: "分析这个数据集的特征" / "Analyze this dataset and recommend methods" / "What methods work best for binary phenotypes?"
301
+ - **`-m / --mask`** (optional): Specify a `species/phenotype` (e.g., `Rapeseed/FloweringTime`) to mask in the reference experience database, preventing "answer leakage" when evaluating on known datasets.
302
+ - **`-o / --output`** (optional): Path to save the analysis result as a JSON file. If not provided, results are printed to the terminal.
303
+
+ #### Dataset Analysis Features
+
+ When a dataset path is provided, the tool automatically computes the following statistical features:
+
+ - **Sample information**: total samples, valid samples, missing rate
+ - **Marker information**: number of markers, genotype statistics (mean, std, missing rate, MAF)
+ - **Phenotype statistics**: mean, std, min, max, median, skewness, kurtosis
+ - **Data type information**: genotype and phenotype data types, binary phenotype detection
+
+ #### Example Output
+
+ The tool returns a JSON object with two main sections:
+
+ ```json
+ {
+   "similar_datasets": {
+     "items": ["Chickpea/Days_to_0.5_flowering", "Cotton/FibLen_17_18"],
+     "reason": "These datasets have similar statistical distributions..."
+   },
+   "methods": {
+     "items": ["GBLUP", "XGBoost", "LightGBM"],
+     "reason": "Based on historical experience, these methods showed best performance on similar datasets..."
+   }
+ }
+ ```
+
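A minimal sketch for consuming the saved result file, assuming the two-section schema shown above:

```python
import json

with open("result.json") as f:  # the file written via -o result.json
    result = json.load(f)

print("Similar datasets:", ", ".join(result["similar_datasets"]["items"]))
print("Recommended methods:", ", ".join(result["methods"]["items"]))
print("Reasoning:", result["methods"]["reason"])
```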
330
+ #### Analysis Workflow
331
+
332
+ When you provide a dataset path, the tool performs the following analysis steps:
333
+
334
+ 1. **Dataset feature extraction**: Computes statistical features from your dataset (phenotype mean, std, skewness, kurtosis, sample size, marker count, etc.)
335
+ 2. **Similar dataset matching**: Compares your dataset features with historical datasets to find the most similar ones
336
+ 3. **Experience table filtering**: Filters the historical experience table to include only results from similar datasets
337
+ 4. **Method analysis and recommendation**: Analyzes which methods performed best on similar datasets and recommends them with detailed reasoning
338
+
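To make step 2 concrete, here is an illustrative matching scheme (not the tool's actual algorithm): z-score the shared numeric features, then rank historical datasets by Euclidean distance. The column names are assumptions based on the feature list above:

```python
import numpy as np
import pandas as pd

def top_k_similar(query_features: dict, history: pd.DataFrame, k: int = 5):
    """Rank historical datasets (one row per species/phenotype, indexed by
    name) by z-scored Euclidean distance to the query's shared features."""
    feats = [c for c in history.columns if c in query_features]
    X = history[feats].to_numpy(dtype=float)
    q = np.array([query_features[c] for c in feats], dtype=float)
    mu, sd = X.mean(axis=0), X.std(axis=0) + 1e-12  # avoid divide-by-zero
    dist = np.linalg.norm((X - mu) / sd - (q - mu) / sd, axis=1)
    return history.index[np.argsort(dist)[:k]].tolist()
```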
339
+ #### Use Cases
340
+
341
+ 1. **General method query**: Query methods based on specific criteria without providing a dataset:
342
+
343
+ ```bash
344
+ python main.py \
345
+ -q "What methods work best for small sample sizes?" \
346
+ -o result.json
347
+ ```
348
+
349
+ 2. **Evaluation mode with masking**: When evaluating on a known dataset, mask it to avoid bias in the analysis:
350
+
351
+ ```bash
352
+ python main.py \
353
+ -d ../data/Rapeseed \
354
+ -q "Analyze this dataset and recommend appropriate algorithms." \
355
+ -m Rapeseed/FloweringTime \
356
+ -o result.json
357
+ ```
358
+
+ ## Output Description
+ - Each method run creates a directory under `result/` named by method/species/phenotype, e.g., `result/DeepCCR/Cotton/<PHENO>/`.
+ - Per-fold prediction results are typically saved as `fold{n}.csv`, containing `Y_test` and `Y_pred` columns.
+ - The script prints or saves average evaluation metrics at the end: PCC (Pearson correlation coefficient), MAE, MSE, R2, along with runtime and memory/GPU usage (a recomputation sketch follows this list).
+
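The recomputation sketch referenced above: given one saved fold file, recover the documented metrics. The fold path is an example, and the fold numbering may differ:

```python
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Recompute metrics from one saved fold (column names as documented above).
df = pd.read_csv("result/DeepCCR/Cotton/FibLen_17_18/fold0.csv")
y_true, y_pred = df["Y_test"], df["Y_pred"]

pcc, _ = pearsonr(y_true, y_pred)
print(f"PCC={pcc:.4f} MAE={mean_absolute_error(y_true, y_pred):.4f} "
      f"MSE={mean_squared_error(y_true, y_pred):.4f} R2={r2_score(y_true, y_pred):.4f}")
```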
+ ## Full Dataset Link
+ - [Species dataset](https://doi.org/10.6084/m9.figshare.31007608): contains genotype and phenotype data for 16 species.
+
+ ## Running Tips & Troubleshooting
+ - For GPU usage, activate the environment (`conda activate Benchmark`) and make sure CUDA drivers are available; `torch.cuda.is_available()` should return True.
+ - If you hit memory or GPU OOM issues, try reducing `--batch_size` or disabling some parallel settings in the scripts.
+ - On CPU-only systems, some GPU-specific methods (RAPIDS or GPU-only implementations) may be unavailable or require alternative implementations.
+
+ ## Contributing & Contact
+ - Contributions via issues and PRs are welcome. Please describe changes and testing in PRs.
+ - Contact: open an Issue in the repository or reach the repository owner (GitHub user: `xwzhang2118`).
+
gpbench-1.0.0/README.md ADDED
@@ -0,0 +1,244 @@
(The README.md content is identical to the README body embedded in PKG-INFO above.)
gpbench-1.0.0/gp_agent_tool/compute_dataset_feature.py ADDED
@@ -0,0 +1,67 @@
+ import numpy as np
+ from scipy.stats import skew, kurtosis
+ import os
+
+ def process_one_phenotype(dataset_path: str) -> dict:
+     """
+     Process a single phenotype and return a summary dict.
+     """
+     geno_path = os.path.join(dataset_path, "genotype.npz")
+     pheno_path = os.path.join(dataset_path, "phenotype.npz")
+
+     genotype = np.load(geno_path)['arr_0']
+     pheno_file = np.load(pheno_path)
+     phenotype = pheno_file['arr_0']
+     phe_name = pheno_file['arr_1']
+     sp_name = pheno_file['arr_2']
+     phe_data = phenotype[:, 0]
+
+     # Remove samples with missing phenotype values
+     mask = ~np.isnan(phe_data)
+     phe_clean = phe_data[mask]
+     geno_clean = genotype[mask] if mask.sum() > 0 else genotype
+
+     summary = {
+         # Basic information
+         # 'species_phenotype': f"{sp_name}/{phe_name}",
+         'species': sp_name,
+         # 'phenotype_name': phe_name,
+
+         # Dimension information
+         'n_samples_total': genotype.shape[0],
+         'n_samples_valid': len(phe_clean),
+         'n_markers': genotype.shape[1] if genotype.ndim > 1 else 1,
+         'missing_rate': 1 - len(phe_clean) / genotype.shape[0],
+
+         # Phenotype statistics
+         'pheno_mean': np.mean(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_std': np.std(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_min': np.min(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_max': np.max(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_median': np.median(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_skewness': skew(phe_clean) if len(phe_clean) > 3 else np.nan,
+         'pheno_kurtosis': kurtosis(phe_clean) if len(phe_clean) > 3 else np.nan,
+
+         # Genotype statistics
+         'geno_mean': np.mean(geno_clean) if geno_clean.size > 0 else np.nan,
+         'geno_std': np.std(geno_clean) if geno_clean.size > 0 else np.nan,
+         'geno_missing_rate': (
+             np.isnan(geno_clean).sum() / geno_clean.size
+             if geno_clean.size > 0 else np.nan
+         ),
+         'geno_maf': (
+             np.mean(
+                 np.minimum(
+                     np.mean(geno_clean, axis=0),
+                     1 - np.mean(geno_clean, axis=0)
+                 )
+             ) if geno_clean.ndim > 1 and geno_clean.size > 0 else np.nan
+         ),
+
+         # Type information
+         'geno_dtype': str(genotype.dtype),
+         'pheno_dtype': str(phe_data.dtype),
+         'is_pheno_binary': len(np.unique(phe_clean)) == 2 if len(phe_clean) > 0 else False
+     }
+
+     return summary
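A hedged usage sketch for this helper. The import path and dataset folder are illustrative; the folder must contain `genotype.npz` and `phenotype.npz` with the `arr_0`/`arr_1`/`arr_2` keys the function reads:

```python
from pprint import pprint

# Assumed import path, matching the file's location in this package.
from gp_agent_tool.compute_dataset_feature import process_one_phenotype

summary = process_one_phenotype("data/Cotton")
pprint(summary)  # these features feed gp_agent_tool's dataset matching step
```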