gpbench 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
@@ -0,0 +1,138 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import random
5
+ import torch
6
+ import numpy as np
7
+ import optuna
8
+ from sklearn.model_selection import KFold, train_test_split
9
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
10
+ from torch.utils.data import DataLoader, TensorDataset
11
+ from optuna.exceptions import TrialPruned
12
+ from .base_dnngp_class import DNNGP
13
+
14
def run_nested_cv_with_early_stopping(
    data, label, nsnp,
    learning_rate, dropout1, dropout2,
    weight_decay, patience,
    batch_size=64, epoch=1000):
    """Evaluate one DNNGP hyperparameter setting with 10-fold cross-validation.

    For each fold the training split is further divided 90/10 into an inner
    train/validation split; the validation part drives early stopping inside
    ``DNNGP.train_model``. Classification metrics are computed on the held-out
    test fold.

    Args:
        data: 2-D array of SNP features, shape (n_samples, nsnp). Assumed to be
            a numpy array (``torch.from_numpy`` is applied) — TODO confirm.
        label: 1-D integer class labels aligned with ``data``.
        nsnp: number of SNP positions (model input size).
        learning_rate, dropout1, dropout2, weight_decay, patience: training
            hyperparameters forwarded to the DNNGP model.
        batch_size: mini-batch size for all three DataLoaders.
        epoch: maximum number of training epochs per fold.

    Returns:
        Mean macro-F1 across the 10 folds (float).
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Starting 10-fold cross-validation...")
    # Fixed seed so every hyperparameter trial sees identical folds.
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    all_acc, all_prec, all_rec, all_f1 = [], [], [], []

    # Output dimension of the classifier head.
    num_classes = len(np.unique(label))

    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        process = psutil.Process(os.getpid())
        fold_start_time = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # Inner split: 10% of the training fold becomes the early-stopping
        # validation set.
        X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42
        )

        x_train_tensor = torch.from_numpy(X_train_sub).float().to(device)
        y_train_tensor = torch.from_numpy(y_train_sub).long().to(device)
        x_valid_tensor = torch.from_numpy(X_valid).float().to(device)
        y_valid_tensor = torch.from_numpy(y_valid).long().to(device)
        x_test_tensor = torch.from_numpy(X_test).float().to(device)
        y_test_tensor = torch.from_numpy(y_test).long().to(device)

        # Add the channel dimension expected by Conv1d: (N, 1, nsnp).
        x_train_tensor = x_train_tensor.unsqueeze(1)
        x_valid_tensor = x_valid_tensor.unsqueeze(1)
        x_test_tensor = x_test_tensor.unsqueeze(1)

        train_loader = DataLoader(
            TensorDataset(x_train_tensor, y_train_tensor),
            batch_size=batch_size, shuffle=True
        )
        valid_loader = DataLoader(
            TensorDataset(x_valid_tensor, y_valid_tensor),
            batch_size=batch_size, shuffle=False
        )
        test_loader = DataLoader(
            TensorDataset(x_test_tensor, y_test_tensor),
            batch_size=batch_size, shuffle=False
        )

        model = DNNGP(nsnp, dropout1, dropout2, output_dim=num_classes).to(device)
        # NOTE(review): DNNGP.train_model constructs its own CrossEntropyLoss,
        # so this attribute appears to be unused — verify before relying on it.
        model.loss_fn = torch.nn.CrossEntropyLoss()

        model.train_model(
            train_loader, valid_loader,
            epoch, learning_rate, weight_decay, patience, device
        )

        # predict returns numpy logits (N, C); argmax over classes.
        logits = model.predict(test_loader)
        y_pred = torch.argmax(torch.tensor(logits), dim=1).cpu().numpy()

        acc = accuracy_score(y_test, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average="macro", zero_division=0
        )

        all_acc.append(acc)
        all_prec.append(prec)
        all_rec.append(rec)
        all_f1.append(f1)

        fold_time = time.time() - fold_start_time
        # Resident-set size of this process, in MiB.
        fold_cpu_mem = process.memory_info().rss / 1024**2

        print(
            f"Fold {fold}: "
            f"ACC={acc:.4f}, PREC={prec:.4f}, REC={rec:.4f}, F1={f1:.4f}, "
            f"Time={fold_time:.2f}s, CPU={fold_cpu_mem:.2f}MB"
        )

    return np.mean(all_f1)
95
+
96
def set_seed(seed=42):
    """Seed every RNG this module touches (python, numpy, torch) for reproducibility."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for deterministic kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
104
+
105
def Hyperparameter(data, label, nsnp):
    """Search DNNGP training hyperparameters with Optuna (20 trials).

    Each trial runs the full 10-fold CV in
    ``run_nested_cv_with_early_stopping`` and maximizes the mean macro-F1.

    Args:
        data: 2-D numpy array of SNP features.
        label: 1-D integer class labels.
        nsnp: number of SNP positions (model input size).

    Returns:
        ``study.best_params`` — dict with the best learning_rate, patience,
        batch_size, dropout1, dropout2 and weight_decay found.
    """
    set_seed(42)

    def objective(trial):
        # Search space: log-uniform learning rate, discrete dropout grid,
        # categorical batch size / weight decay.
        lr = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
        patience = trial.suggest_int("patience", 1, 10)
        batch_size = trial.suggest_categorical("batch_size", [32, 64])
        dropout1 = trial.suggest_float("dropout1", 0.0, 0.9, step=0.1)
        dropout2 = trial.suggest_float("dropout2", 0.0, 0.9, step=0.1)
        weight_decay = trial.suggest_categorical("weight_decay", [1e-5, 1e-4, 1e-3])

        try:
            f1 = run_nested_cv_with_early_stopping(
                data=data,
                label=label,
                nsnp=nsnp,
                learning_rate=lr,
                dropout1=dropout1,
                dropout2=dropout2,
                weight_decay=weight_decay,
                patience=patience,
                batch_size=batch_size
            )
        except TrialPruned:
            # NOTE(review): returning -inf records the trial as *complete*
            # with value -inf; re-raising would let Optuna mark it pruned.
            # Also, the CV helper above never raises TrialPruned itself, so
            # this branch looks unreachable — confirm intent.
            return float("-inf")

        return f1

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    print("Best hyperparameters:", study.best_params)
    print("successfully")
    return study.best_params
@@ -0,0 +1,5 @@
1
"""Public entry point for the DNNGP classification method package."""
from .DNNGP_class import DNNGP_class

# Backward-compatible short alias for the runner entry point.
DNNGP = DNNGP_class

__all__ = ["DNNGP","DNNGP_class"]
@@ -0,0 +1,116 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+
5
+
6
class DNNGP(nn.Module):
    """DNNGP: a three-stage 1-D CNN for SNP-based phenotype classification.

    Expects input of shape (batch, 1, input_size). Each Conv1d uses
    kernel_size=4 with no padding, so every stage shortens the sequence by 3
    positions and the flattened feature size is ``64 * (input_size - 9)``.
    """

    def __init__(self, input_size, dropout1, dropout2, output_dim):
        """
        Args:
            input_size: number of SNP positions per sample (sequence length).
            dropout1: dropout rate after the first conv stage.
            dropout2: dropout rate after the second conv stage.
            output_dim: number of output classes (logit dimension).
        """
        super().__init__()

        self.CNN1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=4)
        self.Relu1 = nn.ReLU()
        self.Drop1 = nn.Dropout(dropout1)

        self.Batchnorm = nn.BatchNorm1d(num_features=64)

        self.CNN2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=4)
        self.Relu2 = nn.ReLU()
        self.Drop2 = nn.Dropout(dropout2)

        self.CNN3 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=4)
        self.Relu3 = nn.ReLU()

        self.Flatten = nn.Flatten()
        # Three convs with kernel_size=4 each remove 3 positions -> input_size - 9.
        self.Dense = nn.Linear(in_features=64 * (input_size - 9), out_features=3)

        self.Output = nn.Linear(in_features=3, out_features=output_dim)

    def forward(self, x):
        """Return class logits (batch, output_dim) for x of shape (batch, 1, input_size)."""
        x = self.CNN1(x)
        x = self.Relu1(x)
        x = self.Drop1(x)
        x = self.Batchnorm(x)

        x = self.CNN2(x)
        x = self.Relu2(x)
        x = self.Drop2(x)

        x = self.CNN3(x)
        x = self.Relu3(x)

        x = self.Flatten(x)
        x = self.Dense(x)
        x = self.Output(x)
        return x

    def train_model(
        self, train_loader, valid_loader,
        num_epochs, learning_rate, weight_decay,
        patience, device
    ):
        """Train with Adam + cross-entropy and early stopping on validation loss.

        Args:
            train_loader, valid_loader: DataLoaders yielding (inputs, labels).
            num_epochs: maximum number of epochs.
            learning_rate, weight_decay: Adam optimizer settings.
            patience: epochs without validation improvement before stopping.
            device: torch.device the model and batches are moved to.

        Returns:
            Lowest validation loss observed (float, or inf if no epoch ran);
            the model is restored to the weights that achieved it.
        """
        optimizer = torch.optim.Adam(
            self.parameters(), lr=learning_rate, weight_decay=weight_decay
        )
        criterion = nn.CrossEntropyLoss()
        self.to(device)

        best_loss = float('inf')
        best_state = None
        trigger_times = 0

        for epoch in range(num_epochs):
            self.train()
            train_loss = 0.0

            for inputs, labels in train_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # Weight by batch size so the epoch average is per-sample.
                train_loss += loss.item() * inputs.size(0)

            self.eval()
            valid_loss = 0.0
            with torch.no_grad():
                for inputs, labels in valid_loader:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    outputs = self(inputs)
                    loss = criterion(outputs, labels)
                    valid_loss += loss.item() * inputs.size(0)

            train_loss /= len(train_loader.dataset)
            valid_loss /= len(valid_loader.dataset)

            # ---------- Early stopping ----------
            if valid_loss < best_loss:
                best_loss = valid_loss
                # BUG FIX: state_dict() returns live references to the
                # parameter/buffer tensors, which the optimizer keeps mutating
                # in place — snapshotting required a detached copy, otherwise
                # restoring "best" weights was a no-op.
                best_state = {
                    k: v.detach().clone() for k, v in self.state_dict().items()
                }
                trigger_times = 0
            else:
                trigger_times += 1
                if trigger_times >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        if best_state is not None:
            self.load_state_dict(best_state)

        return best_loss

    def predict(self, test_loader):
        """Return raw logits as a numpy array of shape (N, output_dim).

        Batches are moved to the model's own device, so the loader may hold
        CPU tensors even when the model lives on GPU.
        """
        self.eval()
        device = next(self.parameters()).device
        logits = []
        with torch.no_grad():
            for inputs, _ in test_loader:
                outputs = self(inputs.to(device))
                logits.append(outputs.cpu().numpy())

        logits = np.concatenate(logits, axis=0)  # (N, C)
        return logits
@@ -0,0 +1,172 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import swanlab
5
+ import argparse
6
+ import random
7
+ import torch
8
+ import numpy as np
9
+ import pandas as pd
10
+ from sklearn.preprocessing import LabelEncoder
11
+ from sklearn.model_selection import StratifiedKFold, train_test_split
12
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
13
+ from torch.utils.data import DataLoader, TensorDataset
14
+
15
+ from .base_DeepCCR_class import DeepCCR
16
+ from . import DeepCCR_he_class
17
+
18
def parse_args(argv=None):
    """Parse command-line options for the DeepCCR classification runner.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, which
            makes argparse read ``sys.argv[1:]`` — identical to the previous
            zero-argument behavior; passing an explicit list enables testing.

    Returns:
        argparse.Namespace with path/dataset options and training
        hyperparameters (the latter may be overwritten by the Optuna search).
    """
    parser = argparse.ArgumentParser()
    # Paths / dataset selection.
    parser.add_argument('--methods', type=str, default='DeepCCR/')
    parser.add_argument('--species', type=str, default='')
    parser.add_argument('--phe', type=str, default='')
    parser.add_argument('--data_dir', type=str, default='../../data/')
    parser.add_argument('--result_dir', type=str, default='result/')

    # Training hyperparameters.
    parser.add_argument('--epoch', type=int, default=1000)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--patience', type=int, default=10)
    return parser.parse_args(argv)
31
+
32
def load_data(args):
    """Load genotype/phenotype arrays for the configured species.

    Reads ``genotype.npz`` (key ``arr_0``) and ``phenotype.npz`` (keys
    ``arr_0`` = phenotype matrix, ``arr_1`` = phenotype names) from
    ``args.data_dir/args.species``.

    Returns:
        (xData, yData, n_samples, n_snps, names).
    """
    species_dir = os.path.join(args.data_dir, args.species)
    xData = np.load(os.path.join(species_dir, 'genotype.npz'))["arr_0"]
    # Open the phenotype archive once instead of twice (the original loaded
    # and decompressed the same file for arr_0 and arr_1 separately).
    phenotype = np.load(os.path.join(species_dir, 'phenotype.npz'))
    yData = phenotype["arr_0"]
    names = phenotype["arr_1"]

    print("Samples:", xData.shape[0])
    print("SNPs:", xData.shape[1])
    return xData, yData, xData.shape[0], xData.shape[1], names
40
+
41
def set_seed(seed=42):
    """Seed python, numpy and torch RNGs so runs are reproducible.

    Args:
        seed: seed value applied to all RNG sources.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Guard the CUDA call for CPU-only builds — consistent with the other
    # set_seed definitions in this package.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels instead of autotuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
48
+
49
def run_nested_cv(args, data, label, nsnp, num_classes, device):
    """Run 10-fold stratified CV for the DeepCCR classifier and save per-fold
    predictions.

    Each training fold is further split 90/10 for early-stopping validation.
    Per-fold predictions are written to ``result_dir/fold{k}.csv``; summary
    metrics are printed at the end.

    Args:
        args: parsed namespace (uses result_dir, methods, species, phe,
            batch_size, epoch, lr, patience).
        data: 2-D numpy SNP matrix; label: 1-D integer class labels.
        nsnp: number of SNPs (model input length).
        num_classes: number of distinct classes.
        device: torch.device forwarded to ``DeepCCR.train_model``/``predict``.
    """

    result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
    os.makedirs(result_dir, exist_ok=True)

    # Stratified so every fold preserves the class distribution.
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    all_acc, all_f1, all_prec, all_rec = [], [], [], []
    time_start = time.time()

    for fold, (train_index, test_index) in enumerate(kf.split(data, label)):
        print(f"Running fold {fold}...")
        process = psutil.Process(os.getpid())
        fold_start = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # Inner 90/10 split for early stopping, stratified on the labels.
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
        )

        # Tensor conversion; unsqueeze adds the Conv1d channel dim: (N, 1, nsnp).
        x_tr = torch.from_numpy(X_tr).float().unsqueeze(1)
        y_tr = torch.from_numpy(y_tr).long()
        x_val = torch.from_numpy(X_val).float().unsqueeze(1)
        y_val = torch.from_numpy(y_val).long()
        x_te = torch.from_numpy(X_test).float().unsqueeze(1)
        y_te = torch.from_numpy(y_test).long()

        train_loader = DataLoader(TensorDataset(x_tr, y_tr), args.batch_size, shuffle=True)
        valid_loader = DataLoader(TensorDataset(x_val, y_val), args.batch_size)
        test_loader = DataLoader(TensorDataset(x_te, y_te), args.batch_size)

        model = DeepCCR(input_seq_len=nsnp, num_classes=num_classes)
        model.train_model(
            train_loader, valid_loader,
            args.epoch, args.lr, args.patience, device
        )

        y_pred = model.predict(test_loader, device)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro')

        all_acc.append(acc)
        all_f1.append(f1)
        all_prec.append(prec)
        all_rec.append(rec)

        fold_time = time.time() - fold_start
        # NOTE(review): gpu_mem is computed but never printed or saved —
        # confirm whether it should appear in the fold report.
        gpu_mem = torch.cuda.max_memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
        cpu_mem = process.memory_info().rss / 1024**2

        print(
            f"Fold {fold}: ACC={acc:.4f}, F1={f1:.4f}, "
            f"Prec={prec:.4f}, Rec={rec:.4f}, "
            f"Time={fold_time:.2f}s"
        )

        # Persist raw predictions so metrics can be recomputed offline.
        pd.DataFrame({
            "y_true": y_test,
            "y_pred": y_pred
        }).to_csv(os.path.join(result_dir, f"fold{fold}.csv"), index=False)

        # NOTE(review): these CUDA calls run unconditionally;
        # reset_peak_memory_stats can raise on CPU-only setups — verify.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    total_time = time.time() - time_start

    print("\n===== CV Summary =====")
    print(f"ACC : {np.mean(all_acc):.4f} ± {np.std(all_acc):.4f}")
    print(f"F1 : {np.mean(all_f1):.4f} ± {np.std(all_f1):.4f}")
    print(f"Time: {total_time:.2f}s")
125
+
126
+
127
+
128
def DeepCCR_class():
    """End-to-end driver for DeepCCR classification.

    For each configured species: load data, clean and encode labels, run the
    Optuna hyperparameter search, then run the final 10-fold CV with the best
    parameters. Results and the label mapping are written under
    ``args.result_dir``.
    """
    set_seed(42)
    torch.cuda.empty_cache()

    args = parse_args()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Hard-coded dataset list; the --species CLI flag is overwritten below.
    all_species = ["Human/Sim/"]

    for species in all_species:
        args.species = species
        # nsamples and names are currently unused here — kept for the tuple shape.
        X, Y, nsamples, nsnp, names = load_data(args)
        print("Starting:", args.methods + args.species)

        # First phenotype column is the classification target.
        label = Y[:, 0]
        label_series = pd.Series(label)
        if label_series.isna().any():
            # Impute missing labels with the mode; fall back to the first
            # non-NaN value, then to 0 if every label is NaN.
            mode_val = label_series.mode()
            fill_val = mode_val.iloc[0] if len(mode_val) > 0 else label_series.dropna().iloc[0] if not label_series.dropna().empty else 0
            label = label_series.fillna(fill_val).values
        else:
            label = label_series.values

        # Map arbitrary label values to 0..K-1 integer classes.
        le = LabelEncoder()
        label = le.fit_transform(label)
        num_classes = len(np.unique(label))

        result_dir = os.path.join(args.result_dir, args.methods + args.species + args.phe)
        os.makedirs(result_dir, exist_ok=True)
        # Save the class index -> original label mapping for later decoding.
        np.save(os.path.join(result_dir, 'label_mapping.npy'), le.classes_)

        # Optuna search, then adopt the best hyperparameters for the final CV.
        best_params = DeepCCR_he_class.Hyperparameter(X, label, nsnp)
        args.lr = best_params['learning_rate']
        args.patience = best_params['patience']
        args.batch_size = best_params['batch_size']
        start_time = time.time()
        run_nested_cv(args, X, label, nsnp, num_classes, device)
        elapsed = time.time() - start_time

        print(f"Total running time: {elapsed:.2f}s")
        print("Successfully finished\n")


if __name__ == "__main__":
    DeepCCR_class()
@@ -0,0 +1,161 @@
1
+ import os
2
+ import time
3
+ import psutil
4
+ import random
5
+ import torch
6
+ import numpy as np
7
+ import optuna
8
+
9
+ from sklearn.model_selection import KFold, train_test_split
10
+ from sklearn.metrics import (
11
+ accuracy_score,
12
+ precision_score,
13
+ recall_score,
14
+ f1_score
15
+ )
16
+ from torch.utils.data import DataLoader, TensorDataset
17
+ from optuna.exceptions import TrialPruned
18
+
19
+ from .base_DeepCCR_class import DeepCCR
20
+
21
def run_nested_cv_with_early_stopping(
    data,
    label,
    nsnp,
    num_classes,
    learning_rate,
    batch_size,
    patience,
    epoch=1000
):
    """Evaluate one DeepCCR hyperparameter setting with 10-fold CV.

    Each training fold is split 90/10 for early-stopping validation; metrics
    are computed on the held-out test fold.

    Args:
        data: 2-D numpy SNP matrix; label: 1-D integer class labels.
        nsnp: number of SNP positions (model input length).
        num_classes: number of distinct classes.
        learning_rate, batch_size, patience: trial hyperparameters.
        epoch: maximum training epochs per fold.

    Returns:
        Dict with mean "acc", "pre", "rec" and "f1" across folds.

    Raises:
        TrialPruned: if any fold yields a NaN macro-F1.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Starting 10-fold cross-validation...")

    # Fixed seed so every trial sees identical folds.
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    acc_list, pre_list, rec_list, f1_list = [], [], [], []

    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Running fold {fold}...")
        process = psutil.Process(os.getpid())
        fold_start_time = time.time()

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = label[train_index], label[test_index]

        # Inner 90/10 split for early stopping.
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42
        )

        # unsqueeze(1) adds the Conv1d channel dimension: (N, 1, nsnp).
        x_tr = torch.from_numpy(X_tr).float().unsqueeze(1)
        y_tr = torch.from_numpy(y_tr).long()
        x_val = torch.from_numpy(X_val).float().unsqueeze(1)
        y_val = torch.from_numpy(y_val).long()
        x_te = torch.from_numpy(X_test).float().unsqueeze(1)
        y_te = torch.from_numpy(y_test).long()

        train_loader = DataLoader(
            TensorDataset(x_tr, y_tr), batch_size, shuffle=True
        )
        valid_loader = DataLoader(
            TensorDataset(x_val, y_val), batch_size, shuffle=False
        )
        test_loader = DataLoader(
            TensorDataset(x_te, y_te), batch_size, shuffle=False
        )

        model = DeepCCR(
            input_seq_len=nsnp,
            num_classes=num_classes
        )

        model.train_model(
            train_loader,
            valid_loader,
            epoch,
            learning_rate,
            patience,
            device
        )

        y_pred = model.predict(test_loader, device)
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average="macro", zero_division=0)
        rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

        # NOTE(review): with zero_division=0, macro F1 should never be NaN,
        # so this pruning branch looks unreachable — confirm.
        if np.isnan(f1):
            print(f"Fold {fold} produced NaN F1, pruning trial.")
            raise TrialPruned()

        acc_list.append(acc)
        pre_list.append(pre)
        rec_list.append(rec)
        f1_list.append(f1)

        fold_time = time.time() - fold_start_time
        fold_gpu_mem = (
            torch.cuda.max_memory_allocated() / 1024**2
            if torch.cuda.is_available()
            else 0
        )
        # Resident-set size of this process, in MiB.
        fold_cpu_mem = process.memory_info().rss / 1024**2

        print(
            f"Fold {fold}: "
            f"ACC={acc:.4f}, PRE={pre:.4f}, REC={rec:.4f}, F1={f1:.4f}, "
            f"Time={fold_time:.2f}s, "
            f"GPU={fold_gpu_mem:.2f}MB, CPU={fold_cpu_mem:.2f}MB"
        )

        # NOTE(review): these CUDA calls run unconditionally;
        # reset_peak_memory_stats can raise on CPU-only setups — verify.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    return {
        "acc": np.mean(acc_list),
        "pre": np.mean(pre_list),
        "rec": np.mean(rec_list),
        "f1": np.mean(f1_list),
    }
120
+
121
def set_seed(seed=42):
    """Make the python, numpy and torch RNGs deterministic for this process."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and force deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
129
+
130
def Hyperparameter(data, label, nsnp):
    """Search DeepCCR training hyperparameters with Optuna (20 trials).

    Each trial runs the full 10-fold CV in
    ``run_nested_cv_with_early_stopping`` and maximizes the mean macro-F1.

    Args:
        data: 2-D numpy SNP matrix.
        label: 1-D class labels (cast to int below).
        nsnp: number of SNP positions.

    Returns:
        ``study.best_params`` — dict with the best learning_rate, batch_size
        and patience found.
    """
    set_seed(42)

    label = label.astype(int)
    num_classes = len(np.unique(label))
    print("Number of classes:", num_classes)

    def objective(trial):
        # Search space: log-uniform LR, categorical batch size, int patience.
        lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
        batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
        patience = trial.suggest_int("patience", 3, 15)

        try:
            metrics = run_nested_cv_with_early_stopping(
                data=data,
                label=label,
                nsnp=nsnp,
                num_classes=num_classes,
                learning_rate=lr,
                batch_size=batch_size,
                patience=patience
            )
        except TrialPruned:
            # NOTE(review): returning -inf records the trial as *complete*
            # with value -inf rather than pruned; re-raising would let Optuna
            # mark it pruned and exclude it cleanly — confirm intent.
            return float("-inf")
        return metrics["f1"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    print("Best parameters:", study.best_params)
    print("successfully")
    return study.best_params
@@ -0,0 +1,5 @@
1
"""Public entry point for the DeepCCR classification method package."""
from .DeepCCR_class import DeepCCR_class

# Backward-compatible short alias for the runner entry point.
DeepCCR = DeepCCR_class

__all__ = ["DeepCCR","DeepCCR_class"]