gpbench 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
@@ -0,0 +1,115 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import argparse
5
+ import random
6
+ import torch
7
+ import numpy as np
8
+ import pandas as pd
9
+ from .bayesCfromR import BayesC
10
+ from sklearn.model_selection import KFold
11
+ from scipy.stats import pearsonr
12
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
13
+
14
def parse_args():
    """
    Parse the command-line options of the BayesC regression benchmark.

    All options are plain strings; the defaults are listed in the table below.

    :return: ``argparse.Namespace`` with ``methods``, ``species``, ``phe``,
        ``data_dir`` and ``result_dir``
    """
    # (flag, default value, help text) for every supported option.
    option_table = (
        ('--methods', 'BayesC/', 'Model name'),
        ('--species', 'Cattle/', 'Species name'),
        ('--phe', '', 'Phenotype name'),
        ('--data_dir', '../../data/', 'Path to data directory'),
        ('--result_dir', 'result/', 'Path to result directory'),
    )

    cli = argparse.ArgumentParser(description="Argument parser")
    for flag, fallback, summary in option_table:
        cli.add_argument(flag, type=str, default=fallback, help=summary)
    return cli.parse_args()
22
+
23
+
24
def load_data(args):
    """
    Load the genotype matrix, phenotype matrix and phenotype names of one species.

    Reads ``genotype.npz`` (key ``arr_0``) and ``phenotype.npz`` (keys ``arr_0``
    and ``arr_1``) from ``<args.data_dir>/<args.species>/``.

    :param args: namespace providing the ``data_dir`` and ``species`` attributes

    :return: tuple ``(xData, yData, nsample, nsnp, names)`` where ``nsample`` and
        ``nsnp`` are the first and second dimension of the genotype array
    """
    species_dir = os.path.join(args.data_dir, args.species)

    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # context manager closes it. Arrays are read into memory on key access,
    # so they stay valid after the archive is closed.
    with np.load(os.path.join(species_dir, 'genotype.npz')) as genotype:
        xData = genotype["arr_0"]
    # Open the phenotype archive once for both of its arrays.
    with np.load(os.path.join(species_dir, 'phenotype.npz')) as phenotype:
        yData = phenotype["arr_0"]
        names = phenotype["arr_1"]

    nsample = xData.shape[0]
    nsnp = xData.shape[1]
    print("Number of samples: ", nsample)
    print("Number of SNPs: ", nsnp)
    return xData, yData, nsample, nsnp, names
34
+
35
def set_seed(seed=42):
    """
    Seed every random number generator used by the benchmark.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode with auto-tuning disabled.

    :param seed: integer seed shared by all generators
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
42
+
43
def run_nested_cv(args, data, label):
    """
    Evaluate BayesC with a shuffled 10-fold cross-validation.

    For every fold a fresh ``BayesC`` model is fitted on the training split and
    scored on the held-out split (MSE, MAE, R2, Pearson correlation). The fold's
    true and predicted values are written to ``fold<k>.csv`` inside
    ``<args.result_dir>/<args.methods><args.species><args.phe>``; per-fold and
    averaged metrics are printed.

    :param args: namespace with ``result_dir``, ``methods``, ``species`` and ``phe``
    :param data: feature matrix, indexed by sample along the first axis
    :param label: target vector aligned with ``data``
    """
    out_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(out_dir, exist_ok=True)
    print("Starting 10-fold cross-validation...")
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    this_process = psutil.Process(os.getpid())

    scores = {"mse": [], "mae": [], "r2": [], "pcc": []}
    cv_began = time.time()

    for fold, (train_index, test_index) in enumerate(splitter.split(data)):
        fold_began = time.time()
        print(f"\n===== Fold {fold} =====")
        X_train, Y_train = data[train_index], label[train_index]
        X_test, Y_test = data[test_index], label[test_index]

        # Start the peak-memory counter afresh so the value reported below
        # belongs to this fold only.
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        model = BayesC(task="regression")
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)
        pcc, _ = pearsonr(Y_test, Y_pred)

        for key, value in (("mse", mse), ("mae", mae), ("r2", r2), ("pcc", pcc)):
            scores[key].append(value)

        fold_time = time.time() - fold_began
        if torch.cuda.is_available():
            fold_gpu_mem = torch.cuda.max_memory_allocated() / 1024**2
        else:
            fold_gpu_mem = 0
        fold_cpu_mem = this_process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
              f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')

        fold_table = pd.DataFrame({'Y_test': Y_test, 'Y_pred': Y_pred})
        fold_table.to_csv(os.path.join(out_dir, f"fold{fold}.csv"), index=False)

    print("\n===== Cross-validation summary =====")
    # The trailing blank in "R2 " keeps the colons of the summary aligned.
    for tag, key in (("PCC", "pcc"), ("MAE", "mae"), ("MSE", "mse"), ("R2 ", "r2")):
        values = scores[key]
        print(f"Average {tag}: {np.mean(values):.4f} ± {np.std(values):.4f}")
    print(f"Total time : {time.time() - cv_began:.2f}s")
90
+
91
+
92
def BayesC_reg():
    """
    Run the BayesC regression benchmark for every phenotype of every species.

    Seeds the RNGs, parses the command line, then for each species in the
    built-in list loads the data and runs ``run_nested_cv`` once per phenotype
    column. Missing phenotype values are replaced by the column mean before
    cross-validation.
    """
    set_seed(42)
    torch.cuda.empty_cache()
    args = parse_args()
    # NOTE: the species list is fixed here and overwrites ``args.species``,
    # so the ``--species`` command-line value is not used.
    all_species = ['Cotton/']
    for species in all_species:
        args.species = species
        X, Y, nsamples, nsnp, names = load_data(args)
        for j, phe_name in enumerate(names):
            args.phe = phe_name
            print("starting run " + args.methods + args.species + args.phe)
            label = Y[:, j]
            # Impute missing phenotype values with the mean of the observed ones.
            label = np.nan_to_num(label, nan=np.nanmean(label))
            start_time = time.time()
            # reset_peak_memory_stats() raises on hosts without CUDA, so it is
            # guarded the same way run_nested_cv guards it.
            if torch.cuda.is_available():
                torch.cuda.reset_peak_memory_stats()
            run_nested_cv(args, data=X, label=label)
            elapsed_time = time.time() - start_time
            print(f"running time: {elapsed_time:.2f} s")
            print("successfully")
112
+
113
+
114
# Script entry point. NOTE: this module uses a relative import
# (``from .bayesCfromR import BayesC``), so it has to be executed as part of
# its package (e.g. with ``python -m``) rather than as a bare file path.
if __name__ == "__main__":
    BayesC_reg()
@@ -0,0 +1,5 @@
1
+ from .BayesC import BayesC_reg
2
+
3
# Expose the benchmark runner under the short package-level name as well.
# NOTE: this rebinds the package attribute ``BayesC`` to the function.
BayesC = BayesC_reg

# Names exported by ``from <package> import *``.
__all__ = ["BayesC","BayesC_reg"]
@@ -0,0 +1,96 @@
1
+ import numpy as np
2
+ import rpy2
3
+ from rpy2.robjects import numpy2ri
4
+ rpy2.robjects.numpy2ri.activate()
5
+ import rpy2.robjects as robjects
6
+ from rpy2.robjects.packages import importr
7
+ from . import _param_free_base_model
8
+ from joblib import Parallel, delayed
9
+
10
class Bayes_R(_param_free_base_model.ParamFreeBaseModel):
    """
    Implementation of a class for Bayesian alphabet.

    The model is fitted by the R package ``BGLR`` through ``rpy2``; the posterior
    estimates of the intercept and the marker effects are kept as NumPy arrays.

    *Attributes*

        *Inherited attributes*

        See :obj:`~easypheno.model._param_free_base_model.ParamFreeBaseModel` for more information on the attributes.

        *Additional attributes*

        - mu (*np.array*): intercept
        - beta (*np.array*): effect size
        - model_name (*str*): model to use (BayesA, BayesB or BayesC)
        - n_iter (*int*): iterations for sampling
        - burn_in (*int*): warmup/burnin for sampling
        - n_jobs (*int*): number of chains that ``fit`` runs and averages (fixed to 1)

    :param task: ML task (regression or classification) depending on target variable
    :param model_name: name of the BGLR model (BayesA, BayesB or BayesC)
    :param encoding: the encoding to use (standard encoding or user-defined)
    :param n_iter: iterations for sampling
    :param burn_in: warmup/burnin for sampling
    """
    standard_encoding = '012'
    possible_encodings = ['101']

    def __init__(self, task: str, model_name: str, encoding: str = None, n_iter: int = 1000, burn_in: int = 200):
        super().__init__(task=task, encoding=encoding)
        self.model_name = model_name
        self.n_iter = n_iter
        self.burn_in = burn_in
        self.n_jobs = 1
        # Populated by fit(); predict() needs both.
        self.mu = None
        self.beta = None

    def _run_chain(self, chain_num: int, R_X, R_y):
        """
        Helper function to run an individual MCMC chain.

        :param chain_num: index of the chain (not used by the sampler call itself)
        :param R_X: feature matrix as R matrix
        :param R_y: target vector as R vector

        :return: tuple ``(beta_chain, mu_chain)`` of NumPy arrays
        """
        BGLR = importr('BGLR')

        # Run BGLR with the configured model (self.model_name) on a single chain
        ETA = robjects.r['list'](robjects.r['list'](X=R_X, model=self.model_name))
        chain_fit = BGLR.BGLR(y=R_y, ETA=ETA, verbose=False, nIter=self.n_iter, burnIn=self.burn_in)

        # Extract the results for this chain
        beta_chain = np.asarray(chain_fit.rx2('ETA').rx2(1).rx2('b'))
        mu_chain = np.asarray(chain_fit.rx2('mu'))  # intercept estimate of this chain
        return beta_chain, mu_chain

    def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Implementation of fit function for Bayesian alphabet imported from R.

        See :obj:`~easypheno.model._param_free_base_model.ParamFreeBaseModel` for more information.

        :param X: feature matrix for retraining
        :param y: target vector

        :return: numpy array with values predicted for X
        """
        # create R objects for X and y
        R_X = robjects.r['matrix'](X, nrow=X.shape[0], ncol=X.shape[1])
        R_y = robjects.FloatVector(y)

        # One chain per job; with n_jobs == 1 this is a single sequential run.
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self._run_chain)(chain_num, R_X, R_y) for chain_num in range(self.n_jobs)
        )

        # Aggregate results from all chains
        beta_chains = [result[0] for result in results]
        mu_chains = [result[1] for result in results]

        # Compute the mean of beta and mu over all chains
        self.beta = np.mean(beta_chains, axis=0)
        self.mu = np.mean(mu_chains, axis=0)

        return self.predict(X_in=X)

    def predict(self, X_in: np.ndarray) -> np.ndarray:
        """
        Implementation of predict function for Bayesian alphabet model imported from R.

        See :obj:`~easypheno.model._param_free_base_model.ParamFreeBaseModel` for more information.

        :param X_in: feature matrix as input

        :return: numpy array with the predicted values (``mu + X_in @ beta``)
        """
        return self.mu + np.matmul(X_in, self.beta)
@@ -0,0 +1,84 @@
1
+ import abc
2
+ import joblib
3
+ import numpy as np
4
+ import pathlib
5
+
6
+
7
class ParamFreeBaseModel(abc.ABC):
    """
    BaseModel parent class for all models that do not have hyperparameters, e.g. BLUP.

    Every model must be based on :obj:`~easypheno.model.param_free_base_model.ParamFreeBaseModel` directly or ParamFreeBaseModel's child classes.

    Please add ``super().__init__(PARAMS)`` to the constructor in case you override it in a child class

    **Attributes**

        *Class attributes*

        - standard_encoding (*str*): the standard encoding for this model
        - possible_encodings (*List<str>*): a list of all encodings that are possible according to the model definition

        *Instance attributes*

        - task (*str*): ML task ('regression' or 'classification') depending on target variable
        - encoding (*str*): the encoding to use (standard encoding or user-defined)


    :param task: ML task (regression or classification) depending on target variable
    :param encoding: the encoding to use (standard encoding or user-defined)

    """

    # Class attributes #
    # Declared as abstract properties so that a concrete child class has to
    # provide both (a plain class attribute is enough). ``property`` must wrap
    # the function directly: a ``classmethod`` object is not callable, so a
    # ``@property`` stacked on ``@classmethod`` yields an unusable getter.
    @property
    @abc.abstractmethod
    def standard_encoding(self):
        """the standard encoding for this model"""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def possible_encodings(self):
        """a list of all encodings that are possible according to the model definition"""
        raise NotImplementedError

    # Constructor super class #
    def __init__(self, task: str, encoding: str = None):
        self.task = task
        # Fall back to the model's standard encoding when none is given.
        self.encoding = self.standard_encoding if encoding is None else encoding

    # Methods required by each child class #

    @abc.abstractmethod
    def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Method that fits the model based on features X and targets y

        :param X: feature matrix for retraining
        :param y: target vector

        :return: numpy array with values predicted for X
        """

    @abc.abstractmethod
    def predict(self, X_in: np.ndarray) -> np.ndarray:
        """
        Method that predicts target values based on the input X_in

        :param X_in: feature matrix as input

        :return: numpy array with the predicted values
        """

    def save_model(self, path: pathlib.Path, filename: str):
        """
        Persist the whole model object on a hard drive
        (can be loaded with :obj:`~easypheno.model._model_functions.load_model`)

        :param path: path where the model will be saved
        :param filename: filename of the model
        """
        joblib.dump(self, path.joinpath(filename), compress=3)
@@ -0,0 +1,16 @@
1
+ from . import _bayesfromR
2
+
3
+
4
class BayesC(_bayesfromR.Bayes_R):
    """
    Implementation of a class for Bayes C.

    Thin wrapper that selects the ``'BayesC'`` model of the shared
    ``Bayes_R`` implementation.

    *Attributes*

        *Inherited attributes*

        See :obj:`~easypheno.model._bayesfromR.Bayes_R` for more information on the attributes.

    :param task: ML task (regression or classification) depending on target variable
    :param encoding: the encoding to use (standard encoding or user-defined)
    """

    def __init__(self, task: str, encoding: str = None):
        super().__init__(task=task, model_name='BayesC', encoding=encoding)
@@ -0,0 +1,159 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import argparse
5
+ import random
6
+ import torch
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.model_selection import KFold, train_test_split
10
+ from .base_CropARNet import SimpleSNPModel
11
+ from scipy.stats import pearsonr
12
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
13
+ from torch.utils.data import DataLoader, TensorDataset
14
+ from . import CropARNet_Hyperparameters
15
+ import pynvml
16
+
17
def parse_args():
    """
    Parse the command-line options of the CropARNet regression benchmark.

    :return: ``argparse.Namespace`` with the run location options (``methods``,
        ``species``, ``phe``, ``data_dir``, ``result_dir``) and the training
        options (``epochs``, ``batch_size``, ``weight_decay``, ``momentum``,
        ``learning_rate``, ``patience``)
    """
    parser = argparse.ArgumentParser(description="Argument parser")
    # Help texts describe what each option actually holds ('--methods' used to
    # be labelled 'Random seed').
    parser.add_argument('--methods', type=str, default='CropARNet/', help='Model name')
    parser.add_argument('--species', type=str, default='', help='Species name')
    parser.add_argument('--phe', type=str, default='', help='Phenotype name')
    parser.add_argument('--data_dir', type=str, default='../../data/', help='Path to data directory')
    parser.add_argument('--result_dir', type=str, default='result/', help='Path to result directory')

    parser.add_argument('--epochs', type=int, default=500, help='Number of training rounds')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
    parser.add_argument('--weight_decay', type=float, default=0.00001, help='Weight decay')
    parser.add_argument('--momentum', type=float, default=0.5, help='Momentum')
    parser.add_argument('--learning_rate', type=float, default=0.01, help='Learning rate')
    parser.add_argument('--patience', type=int, default=50, help='Patience for early stopping')
    args = parser.parse_args()
    return args
33
+
34
def load_data(args):
    """
    Load the genotype matrix, phenotype matrix and phenotype names of one species.

    Reads ``genotype.npz`` (key ``arr_0``) and ``phenotype.npz`` (keys ``arr_0``
    and ``arr_1``) from ``<args.data_dir>/<args.species>/``.

    :param args: namespace providing the ``data_dir`` and ``species`` attributes

    :return: tuple ``(xData, yData, nsample, nsnp, names)`` where ``nsample`` and
        ``nsnp`` are the first and second dimension of the genotype array
    """
    species_dir = os.path.join(args.data_dir, args.species)

    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # context manager closes it. Arrays are read into memory on key access,
    # so they stay valid after the archive is closed.
    with np.load(os.path.join(species_dir, 'genotype.npz')) as genotype:
        xData = genotype["arr_0"]
    # Open the phenotype archive once for both of its arrays.
    with np.load(os.path.join(species_dir, 'phenotype.npz')) as phenotype:
        yData = phenotype["arr_0"]
        names = phenotype["arr_1"]

    nsample = xData.shape[0]
    nsnp = xData.shape[1]
    print("Number of samples: ", nsample)
    print("Number of SNPs: ", nsnp)
    return xData, yData, nsample, nsnp, names
44
+
45
def set_seed(seed=42):
    """
    Seed every random number generator used by the benchmark.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode with auto-tuning disabled.

    :param seed: integer seed shared by all generators
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
52
+
53
def get_gpu_mem_by_pid(pid, handle=None, device_index=0):
    """
    Return the GPU memory (in MiB) that NVML reports for one process.

    NVML must already be initialised (``pynvml.nvmlInit()``) by the caller.

    :param pid: process id to look up
    :param handle: NVML device handle to query; when ``None`` the handle of
        ``device_index`` is fetched here
    :param device_index: GPU index used when no ``handle`` is passed

    :return: memory used by ``pid`` on that device in MiB, ``0.0`` when the
        process is not listed or NVML cannot report its usage
    """
    # The function previously read a name ``handle`` that existed only as a
    # local of the caller; resolve the device handle here instead.
    if handle is None:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

    for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
        if proc.pid == pid:
            used_bytes = proc.usedGpuMemory
            # pynvml reports None when the driver cannot attribute memory.
            return used_bytes / 1024**2 if used_bytes is not None else 0.0
    return 0.0
59
+
60
+
61
def run_nested_cv(args, data, label, nsnp, device):
    """
    Evaluate CropARNet with a shuffled 10-fold cross-validation.

    For every fold, 10% of the training split is held out as validation set for
    early stopping, a fresh ``SimpleSNPModel`` is trained and then scored on the
    test split (MSE, MAE, R2, Pearson correlation). The fold's true and
    predicted values are written to ``fold<k>.csv`` inside
    ``<args.result_dir>/<args.methods><args.species><args.phe>``; per-fold and
    averaged metrics are printed.

    :param args: namespace with ``result_dir``, ``methods``, ``species``, ``phe``,
        ``batch_size``, ``epochs``, ``learning_rate``, ``weight_decay`` and ``patience``
    :param data: feature matrix, indexed by sample along the first axis
    :param label: target vector aligned with ``data``
    :param nsnp: number of SNPs, passed to the model constructor
    :param device: torch device (or device string) that holds the tensors
    """
    result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(result_dir, exist_ok=True)
    print("Starting 10-fold cross-validation...")
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    process = psutil.Process(os.getpid())

    # CUDA/NVML bookkeeping is only valid when the run really uses a GPU; the
    # caller falls back to a CPU device when CUDA is unavailable.
    on_gpu = torch.cuda.is_available() and torch.device(device).type == "cuda"

    all_mse, all_mae, all_r2, all_pcc = [], [], [], []
    time_star = time.time()
    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        fold_start_time = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # Validation subset used by train_model for early stopping.
        X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42)

        x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
        y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
        x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
        y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
        x_test_tensor = torch.from_numpy(X_test).float().to(device)
        y_test_tensor = torch.from_numpy(y_test).float().to(device)

        train_data = TensorDataset(x_train_tensor, y_train_tensor)
        valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
        test_data = TensorDataset(x_test_tensor, y_test_tensor)

        train_loader = DataLoader(train_data, args.batch_size, shuffle=True)
        valid_loader = DataLoader(valid_data, args.batch_size, shuffle=False)
        test_loader = DataLoader(test_data, args.batch_size, shuffle=False)

        model = SimpleSNPModel(nsnp)
        model.train_model(train_loader, valid_loader, args.epochs, args.learning_rate,
                          args.weight_decay, args.patience, device)
        y_pred = model.predict(test_loader)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        pcc, _ = pearsonr(y_test, y_pred)

        all_mse.append(mse)
        all_r2.append(r2)
        all_mae.append(mae)
        all_pcc.append(pcc)

        fold_time = time.time() - fold_start_time
        fold_gpu_mem = get_gpu_mem_by_pid(os.getpid()) if on_gpu else 0.0
        fold_cpu_mem = process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
              f'GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB')

        # reset_peak_memory_stats() raises without CUDA, hence the guard.
        if on_gpu:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        results_df = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
        results_df.to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)

    print("\n===== Cross-validation summary =====")
    print(f"Average PCC: {np.mean(all_pcc):.4f} ± {np.std(all_pcc):.4f}")
    print(f"Average MAE: {np.mean(all_mae):.4f} ± {np.std(all_mae):.4f}")
    print(f"Average MSE: {np.mean(all_mse):.4f} ± {np.std(all_mse):.4f}")
    print(f"Average R2 : {np.mean(all_r2):.4f} ± {np.std(all_r2):.4f}")
    print(f"Time: {time.time() - time_star:.2f}s")
126
+
127
def CropARNet_reg():
    """
    Run the CropARNet regression benchmark for every phenotype of every species.

    Seeds the RNGs, parses the command line, then for each species in the
    built-in list loads the data and, per phenotype column, tunes the
    hyperparameters with ``CropARNet_Hyperparameters.Hyperparameter`` and runs
    ``run_nested_cv`` with the tuned values. Missing phenotype values are
    replaced by the column mean beforehand.
    """
    # get_gpu_mem_by_pid looks ``handle`` up at module level, so the NVML
    # handle has to be published as a global rather than kept as a local.
    global handle

    set_seed(42)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # NVML is only needed (and only guaranteed to exist) on GPU hosts.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    else:
        handle = None

    args = parse_args()
    # NOTE: the species list is fixed here and overwrites ``args.species``,
    # so the ``--species`` command-line value is not used.
    all_species = ['Cotton/']
    for species in all_species:
        args.species = species
        device = torch.device("cuda:0" if cuda_ready else "cpu")
        args.device = device
        X, Y, nsamples, nsnp, names = load_data(args)
        for j, phe_name in enumerate(names):
            args.phe = phe_name
            print("starting run " + args.methods + args.species + args.phe)
            label = Y[:, j]
            # Impute missing phenotype values with the mean of the observed ones.
            label = np.nan_to_num(label, nan=np.nanmean(label))

            # The tuned values replace the corresponding command-line options.
            best_params = CropARNet_Hyperparameters.Hyperparameter(X, label, nsnp)
            args.learning_rate = best_params['learning_rate']
            args.batch_size = best_params['batch_size']
            args.weight_decay = best_params['weight_decay']
            args.patience = best_params['patience']

            start_time = time.time()
            # reset_peak_memory_stats() raises on hosts without CUDA.
            if cuda_ready:
                torch.cuda.reset_peak_memory_stats()
            run_nested_cv(args, data=X, label=label, nsnp=nsnp, device=args.device)

            elapsed_time = time.time() - start_time
            print(f"running time: {elapsed_time:.2f} s")
            print("successfully")
156
+
157
+
158
# Script entry point. NOTE: this module uses relative imports
# (e.g. ``from .base_CropARNet import SimpleSNPModel``), so it has to be
# executed as part of its package (e.g. with ``python -m``) rather than as a
# bare file path.
if __name__ == "__main__":
    CropARNet_reg()
@@ -0,0 +1,109 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import random
5
+ import torch
6
+ import numpy as np
7
+ import optuna
8
+ from sklearn.model_selection import KFold, train_test_split
9
+ from .base_CropARNet import SimpleSNPModel
10
+ from scipy.stats import pearsonr
11
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
12
+ from torch.utils.data import DataLoader, TensorDataset
13
+ from optuna.exceptions import TrialPruned
14
+
15
def run_nested_cv_with_early_stopping(data, label, nsnp, learning_rate, weight_decay, patience, batch_size, num_round=500):
    """
    Score one hyperparameter setting with a shuffled 10-fold cross-validation.

    For every fold, 10% of the training split is held out as validation set for
    early stopping, a fresh ``SimpleSNPModel`` is trained and the Pearson
    correlation on the test split is recorded.

    :param data: feature matrix, indexed by sample along the first axis
    :param label: target vector aligned with ``data``
    :param nsnp: number of SNPs, passed to the model constructor
    :param learning_rate: learning rate passed to ``train_model``
    :param weight_decay: weight decay passed to ``train_model``
    :param patience: early-stopping patience passed to ``train_model``
    :param batch_size: batch size of all data loaders
    :param num_round: maximum number of training rounds

    :return: mean Pearson correlation over the folds (``0.0`` if no fold ran)
    :raises TrialPruned: when a fold yields a NaN correlation
    """
    # Use the GPU when there is one, otherwise fall back to the CPU instead of
    # failing on the first ``.to(device)``.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Starting 10-fold cross-validation...")
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    process = psutil.Process(os.getpid())
    all_pcc = []

    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        fold_start_time = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # Validation subset used by train_model for early stopping.
        X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42)

        x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
        y_train_tensor = torch.from_numpy(y_train_sub).float().to(device)
        x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
        y_valid_tensor = torch.from_numpy(y_valid).float().to(device)
        x_test_tensor = torch.from_numpy(X_test).float().to(device)
        y_test_tensor = torch.from_numpy(y_test).float().to(device)

        train_data = TensorDataset(x_train_tensor, y_train_tensor)
        valid_data = TensorDataset(x_valid_tensor, y_valid_tensor)
        test_data = TensorDataset(x_test_tensor, y_test_tensor)

        train_loader = DataLoader(train_data, batch_size, shuffle=True)
        valid_loader = DataLoader(valid_data, batch_size, shuffle=False)
        test_loader = DataLoader(test_data, batch_size, shuffle=False)

        model = SimpleSNPModel(nsnp)
        model.train_model(train_loader, valid_loader, num_round, learning_rate, weight_decay, patience, device)
        y_pred = model.predict(test_loader)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        pcc, _ = pearsonr(y_test, y_pred)

        # A NaN correlation makes the setting unusable; abandon the trial.
        if np.isnan(pcc):
            print(f"Fold {fold} resulted in NaN PCC, pruning the trial...")
            raise TrialPruned()

        # Only the correlation feeds the objective; the other metrics are
        # reported in the fold line below.
        all_pcc.append(pcc)

        fold_time = time.time() - fold_start_time
        fold_cpu_mem = process.memory_info().rss / 1024**2
        print(f'Fold {fold}: Corr={pcc:.4f}, MAE={mae:.4f}, MSE={mse:.4f}, R2={r2:.4f}, Time={fold_time:.2f}s, '
              f'CPU={fold_cpu_mem:.2f}MB')

    return np.mean(all_pcc) if all_pcc else 0.0
71
+
72
def set_seed(seed=42):
    """
    Seed every random number generator used during hyperparameter search.

    Seeds Python's ``random``, NumPy and PyTorch; CUDA generators are seeded
    only when CUDA is available. cuDNN is switched to deterministic mode with
    auto-tuning disabled.

    :param seed: integer seed shared by all generators
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
80
+
81
def Hyperparameter(data, label, nsnp):
    """
    Search CropARNet training hyperparameters with Optuna.

    Runs 20 trials; each trial samples ``learning_rate``, ``batch_size``,
    ``weight_decay`` and ``patience`` and is scored by the mean Pearson
    correlation returned by ``run_nested_cv_with_early_stopping``.

    :param data: feature matrix
    :param label: target vector
    :param nsnp: number of SNPs

    :return: dict with the best value found for each sampled hyperparameter
    """
    set_seed(42)

    def objective(trial):
        # Sampling order matters to the sampler, so it is kept as
        # learning_rate -> batch_size -> weight_decay -> patience.
        sampled = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1),
            "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
            "weight_decay": trial.suggest_categorical("weight_decay", [1e-4, 1e-3, 1e-2, 1e-1]),
            "patience": trial.suggest_int("patience", 1, 10),
        }
        try:
            return run_nested_cv_with_early_stopping(data=data, label=label, nsnp=nsnp, **sampled)
        except TrialPruned:
            # A pruned run is ranked as the worst possible score.
            return float("-inf")

    search = optuna.create_study(direction="maximize")
    search.optimize(objective, n_trials=20)

    print("best params:", search.best_params)
    print("successfully")
    return search.best_params
@@ -0,0 +1,5 @@
1
+ from .CropARNet import CropARNet_reg
2
+
3
# Expose the benchmark runner under the short package-level name as well.
# NOTE: this rebinds the package attribute ``CropARNet`` to the function.
CropARNet = CropARNet_reg

# Names exported by ``from <package> import *``.
__all__ = ["CropARNet","CropARNet_reg"]