gpbench 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188):
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
@@ -0,0 +1,91 @@
1
+ import gc
2
+ import random
3
+ import time
4
+ import numpy as np
5
+ import optuna
6
+ from sklearn.model_selection import StratifiedKFold
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
9
+ from optuna.exceptions import TrialPruned
10
+
11
def run_nested_cv_with_early_stopping(data, label, outer_cv, C, l1_ratio):
    """Cross-validate one elastic-net logistic-regression configuration.

    For every split produced by ``outer_cv`` a fresh ``LogisticRegression``
    (``penalty='elasticnet'``, ``solver='saga'``) is fitted on the training
    indices and evaluated on the held-out indices.  Accuracy and macro
    precision / recall / F1 are printed per fold and as mean ± std.

    Args:
        data: Feature array, indexed with the index arrays from the splitter.
        label: Label array aligned with ``data``.
        outer_cv: Object exposing ``split(data, label)``.
        C: Forwarded to ``LogisticRegression`` as ``C``.
        l1_ratio: Forwarded to ``LogisticRegression`` as ``l1_ratio``.

    Returns:
        The mean macro-F1 over all folds, as a ``float``.

    Raises:
        TrialPruned: When the mean F1 is NaN or not strictly positive.
    """
    started_at = time.time()
    accs, precs, recs, f1s = [], [], [], []

    for fold_no, (train_idx, test_idx) in enumerate(outer_cv.split(data, label), start=1):
        clf = LogisticRegression(
            penalty='elasticnet',
            C=C,
            l1_ratio=l1_ratio,
            solver='saga',
            max_iter=1000,
            random_state=42,
            n_jobs=-1,
        )
        clf.fit(data[train_idx], label[train_idx])

        truth = label[test_idx]
        predicted = clf.predict(data[test_idx])

        acc = accuracy_score(truth, predicted)
        prec, rec, f1, _ = precision_recall_fscore_support(
            truth, predicted, average="macro", zero_division=0
        )
        for bucket, value in zip((accs, precs, recs, f1s), (acc, prec, rec, f1)):
            bucket.append(value)

        print(f'Fold {fold_no}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}')
        # Drop per-fold objects before the next fit.
        del clf, predicted, truth

    print("==== Final Results ====")
    for tag, values in (("ACC", accs), ("PREC", precs), ("REC", recs), ("F1", f1s)):
        print(f"{tag}: {np.mean(values):.4f} ± {np.std(values):.4f}")

    print(f"Time: {time.time() - started_at:.2f}s")
    gc.collect()

    mean_f1 = float(np.mean(f1s)) if f1s else 0.0
    # ``not (x > 0)`` is true for NaN as well as for x <= 0.
    if not (mean_f1 > 0):
        raise TrialPruned()

    return mean_f1
60
+
61
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
64
+
65
def Hyperparameter(data, label, n_trials=20, seed=42):
    """Search ``C`` and ``l1_ratio`` for the elastic-net classifier with Optuna.

    Each trial evaluates one configuration with 10-fold stratified
    cross-validation via ``run_nested_cv_with_early_stopping`` and the study
    maximises the returned mean macro-F1.

    Args:
        data: Feature array passed through to the cross-validation routine.
        label: Label array aligned with ``data``.
        n_trials: Number of Optuna trials to run (default 20, the previous
            hard-coded value).
        seed: Seed for the Python/NumPy RNGs and for the Optuna sampler
            (default 42, the previous hard-coded value).

    Returns:
        ``study.best_params``: a dict with keys ``"C"`` and ``"l1_ratio"``.
    """
    set_seed(seed)

    # The splitter is identical for every trial, so build it once.
    outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    def objective(trial):
        C = trial.suggest_float("C", 1e-4, 100.0, log=True)
        l1_ratio = trial.suggest_categorical("l1_ratio", [0.1, 0.3, 0.5, 0.7, 0.9])
        try:
            return run_nested_cv_with_early_stopping(
                data=data,
                label=label,
                outer_cv=outer_cv,
                C=C,
                l1_ratio=l1_ratio,
            )
        except TrialPruned:
            # Keep the trial as a completed worst-case result so that
            # ``study.best_params`` stays available even if every
            # configuration is rejected.
            return float("-inf")

    # Seeding the global RNGs does not seed Optuna's sampler; without an
    # explicit seed the sequence of suggested parameters differs between runs.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("best params:", study.best_params)
    print("successfully")
    return study.best_params
@@ -0,0 +1,5 @@
1
+ from .ElasticNet_class import ElasticNet_class
2
+
3
# Public names of this package; both refer to the same object.
__all__ = ["ElasticNet", "ElasticNet_class"]

# Shorter alias for the imported ``ElasticNet_class`` entry point.
ElasticNet = ElasticNet_class
@@ -0,0 +1,217 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import random
5
+ import torch
6
+ import numpy as np
7
+ import optuna
8
+ from sklearn.model_selection import StratifiedKFold, train_test_split
9
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
10
+ from torch.utils.data import DataLoader, TensorDataset
11
+ from optuna.exceptions import TrialPruned
12
+ from .base_G2PDeep_class import G2PDeep, ModelHyperparams
13
+
14
+
15
def train_model(model, train_loader, valid_loader, optimizer, criterion, num_epochs, patience, device):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the sample-weighted mean validation loss is computed.
    Whenever it improves, a CPU copy of the model's ``state_dict`` is kept.
    Training stops once the loss has failed to improve for ``patience``
    consecutive epochs, and the best recorded weights are loaded back into
    ``model`` before returning.  On CUDA devices the forward passes run under
    autocast with a gradient scaler.

    Args:
        model: ``torch.nn.Module`` to train (modified in place).
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        valid_loader: Iterable of ``(inputs, labels)`` validation batches;
            ``valid_loader.dataset`` must support ``len``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated.
        device: ``torch.device`` or a device string such as ``"cpu"``.

    Returns:
        The best validation loss seen (``float('inf')`` if no epoch improved
        on it, e.g. when ``num_epochs`` is 0 or the loss was never finite).
    """
    # Accept plain strings as well as torch.device objects.
    device = torch.device(device)
    model.to(device)

    use_amp = device.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda') if use_amp else None

    best_loss = float('inf')
    best_state = None
    stale_epochs = 0

    for epoch in range(num_epochs):
        model.train()
        # The running training loss was never reported, so it is not
        # accumulated: that avoids a host/device sync on every batch.
        for inputs, labels in train_loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad()

            if use_amp:
                with torch.amp.autocast('cuda'):
                    loss = criterion(model(inputs), labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss = criterion(model(inputs), labels)
                loss.backward()
                optimizer.step()

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                if use_amp:
                    with torch.amp.autocast('cuda'):
                        loss = criterion(model(inputs), labels)
                else:
                    loss = criterion(model(inputs), labels)

                # Weight by batch size so a short last batch is not over-counted.
                valid_loss += loss.item() * inputs.size(0)
        valid_loss /= len(valid_loader.dataset)

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    if best_state is not None:
        # load_state_dict copies into the existing parameters, so the CPU
        # snapshot does not have to be moved to the model's device first.
        model.load_state_dict(best_state)
    return best_loss
82
+
83
+
84
def predict(model, test_loader, device):
    """Return the predicted class index for every sample in ``test_loader``.

    The model is switched to eval mode, moved to ``device`` and run without
    gradient tracking (under autocast on CUDA).  The predicted class is the
    ``argmax`` over dimension 1 of the model output.

    Args:
        model: ``torch.nn.Module`` producing per-class scores.
        test_loader: Iterable of ``(inputs, labels)`` batches; labels are ignored.
        device: ``torch.device`` or a device string such as ``"cpu"``.

    Returns:
        1-D NumPy array of predicted class indices in loader order; an empty
        ``int64`` array when the loader yields no batches.
    """
    # Accept plain strings as well as torch.device objects.
    device = torch.device(device)
    model.eval()
    model.to(device)
    use_amp = device.type == 'cuda'

    batches = []
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device, non_blocking=True)
            if use_amp:
                with torch.amp.autocast('cuda'):
                    outputs = model(inputs)
            else:
                outputs = model(inputs)
            batches.append(torch.argmax(outputs, dim=1).cpu())

    if not batches:
        # torch.cat rejects an empty list; an empty loader means no predictions.
        return np.empty(0, dtype=np.int64)
    return torch.cat(batches, dim=0).numpy()
101
+
102
+
103
def run_nested_cv_with_early_stopping(data, label, nsnp, num_classes, learning_rate, patience, batch_size, epochs=1000):
    """Evaluate one G2PDeep training configuration with 10-fold stratified CV.

    In every fold 10% of the training part is split off (stratified) as the
    validation set used by ``train_model`` for early stopping; the trained
    model is then scored on the fold's test part.  Per-fold metrics, wall time
    and process RSS are printed, followed by a mean ± std summary.

    Args:
        data: NumPy feature array, indexed by the fold index arrays.
        label: NumPy label array aligned with ``data``.
        nsnp: Forwarded to ``G2PDeep`` as ``nsnp``.
        num_classes: Forwarded to ``G2PDeep`` as ``num_classes``.
        learning_rate: Learning rate for the Adam optimizer.
        patience: Early-stopping patience forwarded to ``train_model``.
        batch_size: Batch size for all three data loaders.
        epochs: Maximum number of epochs per fold.

    Returns:
        Mean macro-F1 over the folds as a ``float`` (0.0 if no fold ran).

    Raises:
        TrialPruned: As soon as one fold yields a NaN or non-positive macro-F1.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Starting 10-fold cross-validation...")
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    accs, precs, recs, f1s = [], [], [], []

    def as_loader(features, targets, shuffle):
        # Wrap a pair of NumPy arrays as float/long tensors in a DataLoader.
        dataset = TensorDataset(
            torch.from_numpy(features).float(),
            torch.from_numpy(targets).long(),
        )
        return DataLoader(
            dataset, batch_size, shuffle=shuffle,
            num_workers=4, pin_memory=True, persistent_workers=True
        )

    for fold, (train_index, test_index) in enumerate(splitter.split(data, label)):
        print(f"Running fold {fold}...")
        proc = psutil.Process(os.getpid())
        fold_started = time.time()

        X_test, y_test = data[test_index], label[test_index]
        X_fit, X_valid, y_fit, y_valid = train_test_split(
            data[train_index], label[train_index],
            test_size=0.1, stratify=label[train_index], random_state=42
        )

        train_loader = as_loader(X_fit, y_fit, True)
        valid_loader = as_loader(X_valid, y_valid, False)
        test_loader = as_loader(X_test, y_test, False)

        net = G2PDeep(nsnp=nsnp, num_classes=num_classes, hyperparams=ModelHyperparams()).to(device)
        adam = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=1e-4)
        cross_entropy = torch.nn.CrossEntropyLoss()

        train_model(net, train_loader, valid_loader, adam, cross_entropy, epochs, patience, device)
        y_pred = predict(net, test_loader, device)

        acc = accuracy_score(y_test, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average="macro", zero_division=0
        )

        # ``not (f1 > 0)`` is true for NaN as well as for f1 <= 0.
        if not (f1 > 0):
            print(f"Fold {fold} resulted in NaN or zero F1, pruning the trial...")
            raise TrialPruned()

        for bucket, value in zip((accs, precs, recs, f1s), (acc, prec, rec, f1)):
            bucket.append(value)

        elapsed = time.time() - fold_started
        rss_mb = proc.memory_info().rss / 1024**2
        print(f'Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, '
              f'Time={elapsed:.2f}s, CPU={rss_mb:.2f}MB')

    print("\n===== Cross-validation summary =====")
    for tag, values in (("ACC", accs), ("PREC", precs), ("REC", recs), ("F1 ", f1s)):
        print(f"Average {tag}: {np.mean(values):.4f} ± {np.std(values):.4f}")

    if not f1s:
        return 0.0
    return float(np.mean(f1s))
179
+
180
+
181
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs with ``seed``.

    When CUDA is available, all CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
189
+
190
+
191
def Hyperparameter(data, label, nsnp, num_classes, n_trials=20, seed=42):
    """Search learning rate, batch size and patience for G2PDeep with Optuna.

    Each trial evaluates one configuration through
    ``run_nested_cv_with_early_stopping`` and the study maximises the returned
    mean macro-F1.

    Args:
        data: NumPy feature array passed through to the cross-validation routine.
        label: NumPy label array aligned with ``data``.
        nsnp: Forwarded to the cross-validation routine.
        num_classes: Forwarded to the cross-validation routine.
        n_trials: Number of Optuna trials to run (default 20, the previous
            hard-coded value).
        seed: Seed for the global RNGs and for the Optuna sampler (default 42,
            the previous hard-coded value).

    Returns:
        ``study.best_params``: a dict with keys ``"learning_rate"``,
        ``"batch_size"`` and ``"patience"``.
    """
    set_seed(seed)

    def objective(trial):
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
        batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
        patience = trial.suggest_int("patience", 10, 100, step=10)
        try:
            return run_nested_cv_with_early_stopping(
                data=data,
                label=label,
                nsnp=nsnp,
                num_classes=num_classes,
                learning_rate=learning_rate,
                patience=patience,
                batch_size=batch_size,
            )
        except TrialPruned:
            # Keep the trial as a completed worst-case result so that
            # ``study.best_params`` stays available even if every
            # configuration is rejected.
            return float("-inf")

    # Seeding the global RNGs does not seed Optuna's sampler; without an
    # explicit seed the sequence of suggested parameters differs between runs.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("best params:", study.best_params)
    print("successfully")
    return study.best_params
@@ -0,0 +1,205 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import swanlab
5
+ import argparse
6
+ import random
7
+ import torch
8
+ import numpy as np
9
+ import pandas as pd
10
+ from sklearn.model_selection import StratifiedKFold, train_test_split
11
+ from sklearn.preprocessing import LabelEncoder
12
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
13
+ from .base_G2PDeep_class import G2PDeep, ModelHyperparams
14
+ from torch.utils.data import DataLoader, TensorDataset
15
+ from . import G2PDeep_he_class
16
+ import pynvml
17
+
18
+
19
def parse_args(argv=None):
    """Parse the command-line options of the G2PDeep classification run.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the previous behaviour.

    Returns:
        ``argparse.Namespace`` with the attributes ``methods``, ``species``,
        ``phe``, ``data_dir``, ``result_dir``, ``epoch``, ``batch_size``,
        ``lr`` and ``patience``.
    """
    parser = argparse.ArgumentParser(description="G2PDeep classification")
    # (flag, type, default) for every supported option.
    options = (
        ('--methods', str, 'G2PDeep/'),
        ('--species', str, ''),
        ('--phe', str, ''),
        ('--data_dir', str, '../../data/'),
        ('--result_dir', str, 'result/'),
        ('--epoch', int, 1000),
        ('--batch_size', int, 64),
        ('--lr', float, 0.001),
        ('--patience', int, 10),
    )
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    return parser.parse_args(argv)
31
+
32
+
33
def process_snp_data(data: np.ndarray) -> np.ndarray:
    """One-hot encode integer SNP codes into four classes.

    Every entry of ``data`` is converted to a number with
    ``pd.to_numeric(..., errors='coerce')`` and used as an index into a
    4x4 identity matrix.

    Entries that cannot be parsed as numbers are encoded as class 0, the same
    class ``load_data`` assigns to the ``-9`` marker.  The integer cast of NaN
    used previously is platform-dependent and typically raises ``IndexError``.

    NOTE(review): as before, codes >= 4 raise ``IndexError`` and negative
    codes wrap around through NumPy indexing — confirm that inputs are
    restricted to 0..3.

    Args:
        data: Array of SNP codes (numeric, or strings holding numbers),
            normally of shape ``(samples, snps)``.

    Returns:
        ``float32`` array of shape ``data.shape + (4,)``.
    """
    nb_classes = 4
    arr = np.asarray(data)

    # pd.to_numeric only accepts 1-D input, so convert the flattened view in
    # one call instead of once per row.
    codes = np.asarray(pd.to_numeric(arr.reshape(-1), errors='coerce'))
    if codes.dtype.kind == 'f':
        codes = np.where(np.isnan(codes), 0, codes)
    targets = codes.astype(np.int64).reshape(arr.shape)

    # A float32 identity avoids a float64 temporary for every row.
    return np.eye(nb_classes, dtype=np.float32)[targets]
46
+
47
+
48
def load_data(args):
    """Load genotype and phenotype arrays for ``args.species``.

    Reads ``genotype.npz`` (``arr_0``) and ``phenotype.npz`` (``arr_0`` and
    ``arr_1``) from ``<args.data_dir>/<args.species>/``.  Genotype entries
    equal to ``-9`` are replaced by ``0`` before the matrix is one-hot encoded
    with ``process_snp_data``.

    Args:
        args: Object providing the ``data_dir`` and ``species`` attributes.

    Returns:
        Tuple ``(xData, yData, nsample, nsnp, names)``: the encoded genotype
        array, ``arr_0`` of the phenotype archive, the sizes of the first two
        genotype dimensions, and ``arr_1`` of the phenotype archive.
    """
    species_dir = os.path.join(args.data_dir, args.species)

    # np.load returns an NpzFile holding an open file handle; the context
    # manager closes it once the arrays have been read.
    with np.load(os.path.join(species_dir, 'genotype.npz')) as genotype:
        xData = genotype["arr_0"]
    # Open the phenotype archive once for both arrays instead of twice.
    with np.load(os.path.join(species_dir, 'phenotype.npz')) as phenotype:
        yData = phenotype["arr_0"]
        names = phenotype["arr_1"]

    xData[xData == -9] = 0
    xData = process_snp_data(xData)
    nsample, nsnp = xData.shape[0], xData.shape[1]
    print("Number of samples: ", nsample)
    print("Number of SNPs: ", nsnp)
    return xData, yData, nsample, nsnp, names
60
+
61
+
62
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also switches cuDNN to deterministic mode and disables its benchmarking.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
69
+
70
+
71
def get_gpu_mem_by_pid(pid, handle=None):
    """Return the GPU memory in MiB that NVML reports for process ``pid``.

    Args:
        pid: Process id to look up.
        handle: NVML device handle; ``None`` disables the lookup.

    Returns:
        ``usedGpuMemory / 1024**2`` of the first compute process whose pid
        matches, or ``0.0`` when there is no handle, no matching process, or
        the query fails for any reason (best-effort monitoring).
    """
    if handle is None:
        return 0.0
    try:
        running = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        used_bytes = next((entry.usedGpuMemory for entry in running if entry.pid == pid), None)
        return 0.0 if used_bytes is None else used_bytes / 1024**2
    except Exception:
        return 0.0
82
+
83
+
84
def run_nested_cv(args, data, label, nsnp, num_classes, device, gpu_handle=None):
    """Run the final 10-fold stratified cross-validation of G2PDeep.

    In every fold 10% of the training part is split off (stratified) as the
    validation set, a fresh ``G2PDeep`` model is trained through its
    ``train_model`` method and scored on the fold's test part.  Per-fold
    metrics, time, GPU and process memory are printed, and the fold's true and
    predicted labels are written to ``fold<k>.csv`` in the result directory
    ``<args.result_dir>/<args.methods + args.species + args.phe>``.

    Args:
        args: Namespace providing ``result_dir``, ``methods``, ``species``,
            ``phe``, ``batch_size``, ``epoch``, ``lr`` and ``patience``.
        data: NumPy feature array, indexed by the fold index arrays.
        label: NumPy label array aligned with ``data``.
        nsnp: Forwarded to ``G2PDeep`` as ``nsnp``.
        num_classes: Forwarded to ``G2PDeep`` as ``num_classes``.
        device: Device the model is moved to and trained on.
        gpu_handle: Optional NVML handle used for the GPU memory read-out.

    Returns:
        None; results are printed and written to disk.
    """
    out_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(out_dir, exist_ok=True)
    print("Starting 10-fold cross-validation...")
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    accs, precs, recs, f1s = [], [], [], []
    run_started = time.time()

    def as_loader(features, targets, shuffle):
        # Wrap a pair of NumPy arrays as float/long tensors in a DataLoader.
        dataset = TensorDataset(
            torch.from_numpy(features).float(),
            torch.from_numpy(targets).long(),
        )
        return DataLoader(
            dataset, args.batch_size, shuffle=shuffle,
            num_workers=4, pin_memory=True, persistent_workers=True
        )

    for fold, (train_index, test_index) in enumerate(splitter.split(data, label)):
        print(f"Running fold {fold}...")
        proc = psutil.Process(os.getpid())
        fold_started = time.time()

        X_test, y_test = data[test_index], label[test_index]
        X_fit, X_valid, y_fit, y_valid = train_test_split(
            data[train_index], label[train_index],
            test_size=0.1, stratify=label[train_index], random_state=42
        )

        train_loader = as_loader(X_fit, y_fit, True)
        valid_loader = as_loader(X_valid, y_valid, False)
        test_loader = as_loader(X_test, y_test, False)

        net = G2PDeep(nsnp=nsnp, num_classes=num_classes, hyperparams=ModelHyperparams()).to(device)
        net.train_model(train_loader, valid_loader, args.epoch, args.lr, args.patience, device)
        y_pred = net.predict(test_loader, device)

        acc = accuracy_score(y_test, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average="macro", zero_division=0
        )
        for bucket, value in zip((accs, precs, recs, f1s), (acc, prec, rec, f1)):
            bucket.append(value)

        elapsed = time.time() - fold_started
        gpu_mb = get_gpu_mem_by_pid(os.getpid(), gpu_handle)
        rss_mb = proc.memory_info().rss / 1024**2
        print(f'Fold {fold}: ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, '
              f'Time={elapsed:.2f}s, GPU={gpu_mb:.2f}MB, CPU={rss_mb:.2f}MB')

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

        fold_table = pd.DataFrame({'Y_test': y_test, 'Y_pred': y_pred})
        fold_table.to_csv(os.path.join(out_dir, f"fold{fold}.csv"), index=False)

    print("\n===== Cross-validation summary =====")
    for tag, values in (("ACC", accs), ("PREC", precs), ("REC", recs), ("F1 ", f1s)):
        print(f"Average {tag}: {np.mean(values):.4f} ± {np.std(values):.4f}")
    print(f"Time: {time.time() - run_started:.2f}s")
162
+
163
+
164
def G2PDeep_class():
    """Run hyper-parameter search and 10-fold evaluation of G2PDeep.

    For every entry of the built-in species list the data is loaded, the first
    phenotype column is label-encoded (NaN replaced via ``np.nan_to_num``),
    ``G2PDeep_he_class.Hyperparameter`` selects learning rate, patience and
    batch size, and ``run_nested_cv`` performs the final evaluation with them.

    NVML is initialised for GPU memory read-outs when CUDA is available; a
    failure there only prints a warning.  NVML is shut down again when the
    function exits, whether normally or through an exception.

    Command-line options are read through ``parse_args``.
    """
    set_seed(42)

    gpu_handle = None
    nvml_ready = False
    try:
        if torch.cuda.is_available():
            pynvml.nvmlInit()
            nvml_ready = True
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    except Exception as e:
        print(f"Warning: GPU monitoring initialization failed: {e}")
        gpu_handle = None

    try:
        args = parse_args()
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        all_species = ["Human/Sim/"]
        for species in all_species:
            args.species = species
            X, Y, _, nsnp, _ = load_data(args)

            print("Starting:", args.methods + args.species)
            label_raw = np.nan_to_num(Y[:, 0])
            le = LabelEncoder()
            label = le.fit_transform(label_raw)
            num_classes = len(le.classes_)

            best_params = G2PDeep_he_class.Hyperparameter(X, label, nsnp, num_classes)
            args.lr = best_params['learning_rate']
            args.patience = best_params['patience']
            args.batch_size = best_params['batch_size']

            start_time = time.time()
            if torch.cuda.is_available():
                torch.cuda.reset_peak_memory_stats()
            run_nested_cv(args, data=X, label=label, nsnp=nsnp, num_classes=num_classes,
                          device=device, gpu_handle=gpu_handle)

            elapsed_time = time.time() - start_time
            print(f"Running time: {elapsed_time:.2f}s")
            print("successfully")
    finally:
        # Every successful nvmlInit() must be matched by nvmlShutdown().
        if nvml_ready:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                print(f"Warning: GPU monitoring shutdown failed: {e}")


if __name__ == "__main__":
    G2PDeep_class()
@@ -0,0 +1,5 @@
1
+ from .G2Pdeep_class import G2PDeep_class
2
+
3
# Public names of this package; both refer to the same object.
__all__ = ["G2PDeep", "G2PDeep_class"]

# Shorter alias for the imported ``G2PDeep_class`` entry point.
G2PDeep = G2PDeep_class