gpbench-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. gp_agent_tool/compute_dataset_feature.py +67 -0
  2. gp_agent_tool/config.py +65 -0
  3. gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  4. gp_agent_tool/experience/dataset_summary_info.py +13 -0
  5. gp_agent_tool/experience/experience_info.py +12 -0
  6. gp_agent_tool/experience/get_matched_experience.py +111 -0
  7. gp_agent_tool/llm_client.py +119 -0
  8. gp_agent_tool/logging_utils.py +24 -0
  9. gp_agent_tool/main.py +347 -0
  10. gp_agent_tool/read_agent/__init__.py +46 -0
  11. gp_agent_tool/read_agent/nodes.py +674 -0
  12. gp_agent_tool/read_agent/prompts.py +547 -0
  13. gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  14. gp_agent_tool/read_agent/state.py +101 -0
  15. gp_agent_tool/read_agent/workflow.py +54 -0
  16. gpbench/__init__.py +25 -0
  17. gpbench/_selftest.py +104 -0
  18. gpbench/method_class/BayesA/BayesA_class.py +141 -0
  19. gpbench/method_class/BayesA/__init__.py +5 -0
  20. gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  21. gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  22. gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  23. gpbench/method_class/BayesB/BayesB_class.py +140 -0
  24. gpbench/method_class/BayesB/__init__.py +5 -0
  25. gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  26. gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  27. gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  28. gpbench/method_class/BayesC/BayesC_class.py +141 -0
  29. gpbench/method_class/BayesC/__init__.py +4 -0
  30. gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  31. gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  32. gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  33. gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  34. gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  35. gpbench/method_class/CropARNet/__init__.py +5 -0
  36. gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  37. gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  38. gpbench/method_class/Cropformer/__init__.py +5 -0
  39. gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  40. gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  41. gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  42. gpbench/method_class/DL_GWAS/__init__.py +5 -0
  43. gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  44. gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  45. gpbench/method_class/DNNGP/__init__.py +5 -0
  46. gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  47. gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  48. gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  49. gpbench/method_class/DeepCCR/__init__.py +5 -0
  50. gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  51. gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  52. gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  53. gpbench/method_class/DeepGS/__init__.py +5 -0
  54. gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  55. gpbench/method_class/EIR/EIR_class.py +276 -0
  56. gpbench/method_class/EIR/EIR_he_class.py +184 -0
  57. gpbench/method_class/EIR/__init__.py +5 -0
  58. gpbench/method_class/EIR/utils/__init__.py +0 -0
  59. gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  60. gpbench/method_class/EIR/utils/common.py +65 -0
  61. gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  62. gpbench/method_class/EIR/utils/logging.py +59 -0
  63. gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  64. gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  65. gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  66. gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  67. gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  68. gpbench/method_class/ElasticNet/__init__.py +5 -0
  69. gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  70. gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  71. gpbench/method_class/G2PDeep/__init__.py +5 -0
  72. gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  73. gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  74. gpbench/method_class/GBLUP/__init__.py +5 -0
  75. gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  76. gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  77. gpbench/method_class/GEFormer/__init__.py +5 -0
  78. gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  79. gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  80. gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  81. gpbench/method_class/LightGBM/__init__.py +5 -0
  82. gpbench/method_class/RF/RF_GPU_class.py +165 -0
  83. gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  84. gpbench/method_class/RF/__init__.py +5 -0
  85. gpbench/method_class/SVC/SVC_GPU.py +181 -0
  86. gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  87. gpbench/method_class/SVC/__init__.py +5 -0
  88. gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  89. gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  90. gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  91. gpbench/method_class/SoyDNGP/__init__.py +5 -0
  92. gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  93. gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  94. gpbench/method_class/XGBoost/__init__.py +5 -0
  95. gpbench/method_class/__init__.py +52 -0
  96. gpbench/method_class/rrBLUP/__init__.py +5 -0
  97. gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  98. gpbench/method_reg/BayesA/BayesA.py +116 -0
  99. gpbench/method_reg/BayesA/__init__.py +5 -0
  100. gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  101. gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  102. gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  103. gpbench/method_reg/BayesB/BayesB.py +117 -0
  104. gpbench/method_reg/BayesB/__init__.py +5 -0
  105. gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  106. gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  107. gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  108. gpbench/method_reg/BayesC/BayesC.py +115 -0
  109. gpbench/method_reg/BayesC/__init__.py +5 -0
  110. gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  111. gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  112. gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  113. gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  114. gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  115. gpbench/method_reg/CropARNet/__init__.py +5 -0
  116. gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  117. gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  118. gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  119. gpbench/method_reg/Cropformer/__init__.py +5 -0
  120. gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  121. gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  122. gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  123. gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  124. gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  125. gpbench/method_reg/DNNGP/__init__.py +5 -0
  126. gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  127. gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  128. gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  129. gpbench/method_reg/DeepCCR/__init__.py +5 -0
  130. gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  131. gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  132. gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  133. gpbench/method_reg/DeepGS/__init__.py +5 -0
  134. gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  135. gpbench/method_reg/EIR/EIR.py +258 -0
  136. gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  137. gpbench/method_reg/EIR/__init__.py +5 -0
  138. gpbench/method_reg/EIR/utils/__init__.py +0 -0
  139. gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  140. gpbench/method_reg/EIR/utils/common.py +65 -0
  141. gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  142. gpbench/method_reg/EIR/utils/logging.py +59 -0
  143. gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  144. gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  145. gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  146. gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  147. gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  148. gpbench/method_reg/ElasticNet/__init__.py +5 -0
  149. gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  150. gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  151. gpbench/method_reg/G2PDeep/__init__.py +5 -0
  152. gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  153. gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  154. gpbench/method_reg/GBLUP/__init__.py +5 -0
  155. gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  156. gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  157. gpbench/method_reg/GEFormer/__init__.py +5 -0
  158. gpbench/method_reg/GEFormer/gMLP.py +341 -0
  159. gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  160. gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  161. gpbench/method_reg/LightGBM/__init__.py +5 -0
  162. gpbench/method_reg/MVP/MVP.py +182 -0
  163. gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  164. gpbench/method_reg/MVP/__init__.py +5 -0
  165. gpbench/method_reg/MVP/base_MVP.py +113 -0
  166. gpbench/method_reg/RF/RF_GPU.py +174 -0
  167. gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  168. gpbench/method_reg/RF/__init__.py +5 -0
  169. gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  170. gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  171. gpbench/method_reg/SVC/__init__.py +5 -0
  172. gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  173. gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  174. gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  175. gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  176. gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  177. gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  178. gpbench/method_reg/XGBoost/__init__.py +5 -0
  179. gpbench/method_reg/__init__.py +55 -0
  180. gpbench/method_reg/rrBLUP/__init__.py +5 -0
  181. gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  182. gpbench-1.0.0.dist-info/METADATA +379 -0
  183. gpbench-1.0.0.dist-info/RECORD +188 -0
  184. gpbench-1.0.0.dist-info/WHEEL +5 -0
  185. gpbench-1.0.0.dist-info/entry_points.txt +2 -0
  186. gpbench-1.0.0.dist-info/top_level.txt +3 -0
  187. tests/test_import.py +80 -0
  188. tests/test_method.py +232 -0
gp_agent_tool/compute_dataset_feature.py
@@ -0,0 +1,67 @@
+ import numpy as np
+ from scipy.stats import skew, kurtosis
+ import os
+
+ def process_one_phenotype(dataset_path: str) -> dict:
+     """
+     Process a single phenotype and return a summary dict.
+     """
+     geno_path = os.path.join(dataset_path, "genotype.npz")
+     pheno_path = os.path.join(dataset_path, "phenotype.npz")
+
+     genotype = np.load(geno_path)['arr_0']
+     pheno_file = np.load(pheno_path)
+     phenotype = pheno_file['arr_0']
+     phe_name = pheno_file['arr_1']
+     sp_name = pheno_file['arr_2']
+     phe_data = phenotype[:, 0]
+
+     # Drop missing values
+     mask = ~np.isnan(phe_data)
+     phe_clean = phe_data[mask]
+     geno_clean = genotype[mask] if mask.sum() > 0 else genotype
+
+     summary = {
+         # Basic information
+         # 'species_phenotype': f"{sp_name}/{phe_name}",
+         'species': sp_name,
+         # 'phenotype_name': phe_name,
+
+         # Dimension information
+         'n_samples_total': genotype.shape[0],
+         'n_samples_valid': len(phe_clean),
+         'n_markers': genotype.shape[1] if genotype.ndim > 1 else 1,
+         'missing_rate': 1 - len(phe_clean) / genotype.shape[0],
+
+         # Phenotype summary statistics
+         'pheno_mean': np.mean(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_std': np.std(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_min': np.min(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_max': np.max(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_median': np.median(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_skewness': skew(phe_clean) if len(phe_clean) > 3 else np.nan,
+         'pheno_kurtosis': kurtosis(phe_clean) if len(phe_clean) > 3 else np.nan,
+
+         # Genotype summary statistics
+         'geno_mean': np.mean(geno_clean) if geno_clean.size > 0 else np.nan,
+         'geno_std': np.std(geno_clean) if geno_clean.size > 0 else np.nan,
+         'geno_missing_rate': (
+             np.isnan(geno_clean).sum() / geno_clean.size
+             if geno_clean.size > 0 else np.nan
+         ),
+         'geno_maf': (
+             np.mean(
+                 np.minimum(
+                     np.mean(geno_clean, axis=0),
+                     1 - np.mean(geno_clean, axis=0)
+                 )
+             ) if geno_clean.ndim > 1 and geno_clean.size > 0 else np.nan
+         ),
+
+         # Type information
+         'geno_dtype': str(genotype.dtype),
+         'pheno_dtype': str(phe_data.dtype),
+         'is_pheno_binary': len(np.unique(phe_clean)) == 2 if len(phe_clean) > 0 else False
+     }
+
+     return summary
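
As a quick sanity check, the sketch below builds a throwaway dataset in the arr_0/arr_1/arr_2 layout that process_one_phenotype reads and runs it end to end. This is a hedged example: the .npz layout is inferred from the loader above, the import path assumes gp_agent_tool is importable from the working directory, and all values are synthetic.

    import os
    import tempfile
    import numpy as np

    from gp_agent_tool.compute_dataset_feature import process_one_phenotype

    dataset_path = tempfile.mkdtemp()
    rng = np.random.default_rng(0)

    # genotype.npz: arr_0 holds a (samples x markers) dosage matrix
    np.savez(os.path.join(dataset_path, "genotype.npz"),
             rng.integers(0, 3, size=(50, 100)))

    # phenotype.npz: arr_0 = (samples x 1) values, arr_1 = phenotype name, arr_2 = species
    np.savez(os.path.join(dataset_path, "phenotype.npz"),
             rng.normal(size=(50, 1)), np.array("mkg"), np.array("Cattle"))

    summary = process_one_phenotype(dataset_path)
    print(summary["n_samples_valid"], summary["n_markers"])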
gp_agent_tool/config.py
@@ -0,0 +1,65 @@
+ """
+ Minimal config loader for gwas-llm-judge.
+
+ The config file should be located at:
+ /home/common/hwluo/project/gwas-llm-judge/config/config.json
+
+ and contain at least the sections:
+ - "llm"
+ - "codegen_llm"
+ - "multimodal_llm"
+ """
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+ from logging_utils import get_logger
+
+
+ logger = get_logger(__name__)
+
+ _CONFIG_CACHE: Optional[Dict[str, Any]] = None
+
+
+ def _config_path() -> Path:
+     # Pinned to config/config.json under this project, to avoid depending on the old project's path
+     return Path(__file__).resolve().parent / "config" / "config.json"
+
+
+ def _load_config() -> Dict[str, Any]:
+     global _CONFIG_CACHE
+     if _CONFIG_CACHE is not None:
+         return _CONFIG_CACHE
+
+     path = _config_path()
+     try:
+         with path.open("r", encoding="utf-8") as f:
+             _CONFIG_CACHE = json.load(f)
+     except FileNotFoundError:
+         logger.warning("Config file not found: %s", path)
+         _CONFIG_CACHE = {}
+     except Exception as exc:  # noqa: BLE001
+         logger.error("Failed to load config %s: %s", path, exc)
+         _CONFIG_CACHE = {}
+     return _CONFIG_CACHE
+
+
+ def get_llm_config() -> Dict[str, Any]:
+     cfg = _load_config()
+     return dict(cfg.get("llm", {}))
+
+
+ def get_codegen_llm_config() -> Dict[str, Any]:
+     cfg = _load_config()
+     return dict(cfg.get("codegen_llm", {}))
+
+
+ def get_multimodal_llm_config() -> Dict[str, Any]:
+     cfg = _load_config()
+     return dict(cfg.get("multimodal_llm", {}))
+
+
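
Since _config_path() resolves to a config/ directory next to this module, a config.json must exist there before any of the getters return non-empty dicts. A minimal bootstrap sketch follows; the section names come from this module and the per-section keys mirror what llm_client.py looks up, but every value (and the exact directory) is a placeholder you would replace.

    import json
    from pathlib import Path

    # Assumed location: the config/ directory sits next to config.py inside gp_agent_tool
    config_dir = Path("gp_agent_tool") / "config"
    config_dir.mkdir(parents=True, exist_ok=True)

    placeholder = {
        "llm": {
            "model": "your-chat-model",
            "api_key": "YOUR-API-KEY",
            "base_url": "https://example.com/v1",  # OpenAI-compatible endpoint
            "timeout_seconds": 60,
            "max_retries": 2,
        },
        "codegen_llm": {
            "model": "your-codegen-model",
            "api_key": "YOUR-API-KEY",
            "base_url": "https://example.com/v1",
        },
        "multimodal_llm": {
            "model": "your-multimodal-model",
            "api_key": "YOUR-API-KEY",
        },
    }

    (config_dir / "config.json").write_text(
        json.dumps(placeholder, indent=2), encoding="utf-8"
    )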
gp_agent_tool/experience/create_masked_dataset_summary.py
@@ -0,0 +1,97 @@
+ import csv
+ import os
+ import uuid
+ from typing import List, Optional
+
+
+ def create_masked_dataset_summary(
+     excluded_species_phenotypes: List[str],
+     source_csv: str = "dataset_summary.csv",
+     output_suffix: str = "_masked",
+ ) -> tuple[str, str]:
+     """
+     Filter the given species_phenotype values out of dataset_summary.csv,
+     write the remaining rows to a new CSV file, and return the absolute path
+     of the new file together with a preview of its first 10 rows (including
+     the header).
+
+     Parameters
+     ----------
+     excluded_species_phenotypes : List[str]
+         species_phenotype values to filter out (matching the first column of the source CSV).
+     source_csv : str, optional
+         Path to the source CSV; defaults to dataset_summary.csv next to this script (relative path).
+     output_suffix : str, optional
+         Suffix for the generated file name, defaults to "_masked".
+
+     Returns
+     -------
+     tuple[str, str]
+         Absolute path of the newly written CSV, and the first 10 rows of the
+         masked data (including the header) serialized as a string.
+     """
+     # Resolve relative paths against this script's directory so the result
+     # does not depend on the runtime working directory
+     base_dir = os.path.dirname(__file__)
+
+     # If the caller passed a relative path, resolve it against the script
+     # directory (defaults to dataset_summary.csv)
+     if not os.path.isabs(source_csv):
+         source_csv = os.path.join(base_dir, source_csv)
+
+     if not os.path.exists(source_csv):
+         raise FileNotFoundError(f"Source CSV not found: {source_csv}")
+
+     # Read the source file and filter rows.
+     # Pre-build a lowercased exclusion set so matching is case-insensitive.
+     excluded_lower = {item.lower() for item in excluded_species_phenotypes}
+     kept_rows: List[List[str]] = []
+     header: Optional[List[str]] = None
+
+     with open(source_csv, "r", newline="", encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for i, row in enumerate(reader):
+             if i == 0:
+                 header = row
+                 continue
+
+             # row[0] should be species_phenotype
+             species_pheno = row[0]
+             # Compare in lowercase for case-insensitive matching
+             if species_pheno.lower() in excluded_lower:
+                 continue
+             kept_rows.append(row)
+
+     # Preview the first 10 rows (including the header), serialized as a string
+     # (comma-separated fields, rows joined with \n)
+     preview_rows: List[List[str]] = []
+     if header is not None:
+         preview_rows.append(header)
+     data_limit = max(0, 10 - len(preview_rows))
+     if data_limit > 0:
+         preview_rows.extend(kept_rows[:data_limit])
+     preview_str = "\n".join([",".join(row) for row in preview_rows])
+
+     # Build the output path: write into the experience/tmp directory (relative to this script)
+     tmp_dir = os.path.join(base_dir, "tmp")
+     os.makedirs(tmp_dir, exist_ok=True)
+
+     _, src_name = os.path.split(source_csv)
+     name, ext = os.path.splitext(src_name)
+     uid = uuid.uuid4().hex
+     output_name = f"{name}{output_suffix}_{uid}{ext}"
+     output_path = os.path.join(tmp_dir, output_name)
+
+     # Write out the new CSV file
+     with open(output_path, "w", newline="", encoding="utf-8") as f:
+         writer = csv.writer(f)
+         if header is not None:
+             writer.writerow(header)
+         writer.writerows(kept_rows)
+
+     return os.path.abspath(output_path), preview_str
+
+
+ if __name__ == "__main__":
+     # Example: exclude a few species_phenotype values
+     example_excluded = ["Cattle/mkg", "Chicken/EW28"]
+     new_path, preview = create_masked_dataset_summary(example_excluded)
+     print(f"Masked dataset summary written to: {new_path}")
+     print("Preview (first 10 rows):")
+     print(preview)
+
gp_agent_tool/experience/dataset_summary_info.py
@@ -0,0 +1,13 @@
+
+ dataset_summary_info = {
+     "file_name": "dataset_summary.csv",
+     "file_path": "/home/common/xwzhang/Project/GPBench/gp_agent_tool/experience/dataset_summary.csv",
+     "description": "This is the summary of the dataset, including the number of samples, the number of missing values, the mean, the standard deviation, the minimum, the maximum, the median, the skewness, the kurtosis, the genotype mean, the genotype standard deviation, the genotype missing rate, the genotype minor allele frequency, the genotype data type, the phenotype data type, and whether the phenotype is binary.",
+     "preview": "\n".join(
+         [
+             "species,phenotype_name,description,n_samples_total,n_samples_valid,n_markers,missing_rate,pheno_mean,pheno_std,pheno_min,pheno_max,pheno_median,pheno_skewness,pheno_kurtosis,geno_mean,geno_std,geno_missing_rate,geno_maf,geno_dtype,pheno_dtype,is_pheno_binary",
+             "Cattle,mkg,Milk yield,5024,5024,42551,0.0,-2.587579633930134e-09,0.9999004678058506,-3.383414,3.318611,0.009681,0.0086577861867763,-0.0458583441004787,1.0713618086920649,0.8048099006002398,0.0,-0.1621657327056165,int64,float64,False",
+         ]
+     ),
+ }
+
gp_agent_tool/experience/experience_info.py
@@ -0,0 +1,12 @@
+ experience_info = {
+     "file_name": "experience_origin.csv",
+     "file_path": "/home/common/xwzhang/Project/GPBench/gp_agent_tool/experience/experience_origin.csv",
+     "description": "This is the performance of 18 methods on different species and phenotype datasets, including the correlation coefficient, the mean absolute error, the mean squared error, the R2, the running time, and the resource usage.",
+     "preview": "\n".join(
+         [
+             "method,species,phenotype,description,all_time_s,corr_mean,corr_std,mae_mean,mae_std,mse_mean,mse_std,r2_mean,r2_std,cpu_mem_MB,gpu_mem_MB",
+             "rrBLUP,Cattle,scs, Somatic cell score,2373.4649,0.7525,0.01936,0.5229,0.02173,0.4339,0.04136,0.5646,0.02846,12677.914,0",
+         ]
+     ),
+ }
+
gp_agent_tool/experience/get_matched_experience.py
@@ -0,0 +1,111 @@
+ import csv
+ import os
+ import uuid
+ from typing import List, Optional, Tuple
+
+
+ def get_matched_experience(
+     target_species_phenotypes: Optional[List[str]],
+     source_csv: Optional[str] = None,
+     output_suffix: str = "_matched",
+ ) -> Tuple[str, str]:
+     """
+     Select the rows of experience_origin.csv that match the given
+     species/phenotype list, write them to a new CSV file, and return the
+     absolute path of the new file together with a preview of its first
+     10 rows (including the header, as a string).
+
+     Parameters
+     ----------
+     target_species_phenotypes : List[str] | None
+         species/phenotype combinations to keep, formatted like "Cattle/fpro".
+         If None, no filtering is applied and all records from the source CSV are returned.
+     source_csv : str, optional
+         Path to the source CSV; defaults to experience_origin.csv next to this script (relative path).
+     output_suffix : str, optional
+         Suffix for the generated file name, defaults to "_matched".
+
+     Returns
+     -------
+     tuple[str, str]
+         Absolute path of the newly written CSV, and the first 10 rows of the
+         matched data (including the header) serialized as a string.
+     """
+     base_dir = os.path.dirname(__file__)
+
+     if source_csv is None:
+         source_csv = os.path.join(base_dir, "experience_origin.csv")
+     elif not os.path.isabs(source_csv):
+         source_csv = os.path.join(base_dir, source_csv)
+
+     if not os.path.exists(source_csv):
+         raise FileNotFoundError(f"Source CSV not found: {source_csv}")
+
+     # Match on lowercased (species, phenotype) pairs so the
+     # matching logic is case-insensitive.
+     match_set = set()
+     if target_species_phenotypes is not None:
+         for item in target_species_phenotypes:
+             if "/" not in item:
+                 raise ValueError(f"Invalid format (expected species/phenotype): {item}")
+             species, phenotype = item.split("/", 1)
+             species = species.strip()
+             phenotype = phenotype.strip()
+             if not species or not phenotype:
+                 raise ValueError(f"Invalid species or phenotype in: {item}")
+             # Store lowercased values in the set
+             match_set.add((species.lower(), phenotype.lower()))
+
+     kept_rows: List[List[str]] = []
+     header: Optional[List[str]] = None
+
+     with open(source_csv, "r", newline="", encoding="utf-8") as f:
+         reader = csv.reader(f)
+         for i, row in enumerate(reader):
+             if i == 0:
+                 header = row
+                 continue
+
+             if len(row) < 3:
+                 continue
+
+             # If target_species_phenotypes is None, keep every record without filtering
+             if target_species_phenotypes is None:
+                 kept_rows.append(row)
+             else:
+                 # Compare in lowercase for case-insensitive matching
+                 species_val = row[1].strip().lower()
+                 pheno_val = row[2].strip().lower()
+                 if (species_val, pheno_val) in match_set:
+                     kept_rows.append(row)
+
+     preview_rows: List[List[str]] = []
+     if header is not None:
+         preview_rows.append(header)
+     data_limit = max(0, 10 - len(preview_rows))
+     if data_limit > 0:
+         preview_rows.extend(kept_rows[:data_limit])
+     preview_str = "\n".join([",".join(row) for row in preview_rows])
+
+     tmp_dir = os.path.join(base_dir, "tmp")
+     os.makedirs(tmp_dir, exist_ok=True)
+
+     _, src_name = os.path.split(source_csv)
+     name, ext = os.path.splitext(src_name)
+     uid = uuid.uuid4().hex
+     output_name = f"{name}{output_suffix}_{uid}{ext}"
+     output_path = os.path.join(tmp_dir, output_name)
+
+     with open(output_path, "w", newline="", encoding="utf-8") as f:
+         writer = csv.writer(f)
+         if header is not None:
+             writer.writerow(header)
+         writer.writerows(kept_rows)
+
+     return os.path.abspath(output_path), preview_str
+
+
+ if __name__ == "__main__":
+     sample_targets = ["Rice/GYP_BLUP", "Mouse/weight", "Chickpea/Yield"]
+     new_path, preview = get_matched_experience(sample_targets)
+     print(f"Matched experience written to: {new_path}")
+     print("Preview (first 10 rows):")
+     print(preview)
gp_agent_tool/llm_client.py
@@ -0,0 +1,119 @@
+ from __future__ import annotations
+
+ from functools import lru_cache
+ from typing import Any, Dict, List
+
+ from dashscope import MultiModalConversation
+ from langchain_core.messages import HumanMessage
+ from langchain_openai import ChatOpenAI
+
+ from config import (
+     get_codegen_llm_config,
+     get_llm_config,
+     get_multimodal_llm_config,
+ )
+ from logging_utils import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ @lru_cache(maxsize=1)
+ def _base_llm_config() -> Dict[str, Any]:
+     return get_llm_config()
+
+
+ @lru_cache(maxsize=1)
+ def _base_codegen_llm_config() -> Dict[str, Any]:
+     return get_codegen_llm_config()
+
+
+ def _build_chat_llm(
+     *, temperature: float, max_tokens: int, use_codegen: bool = False
+ ) -> ChatOpenAI:
+     base_config = _base_codegen_llm_config() if use_codegen else _base_llm_config()
+     params: Dict[str, Any] = {
+         "model": base_config.get("model"),
+         "api_key": base_config.get("api_key"),
+         "base_url": base_config.get("base_url"),
+         "temperature": temperature,
+         "max_tokens": max_tokens,
+     }
+     if base_config.get("timeout_seconds") is not None:
+         params["timeout"] = base_config["timeout_seconds"]
+     if base_config.get("max_retries") is not None:
+         params["max_retries"] = base_config["max_retries"]
+     return ChatOpenAI(**params)
+
+
+ def run_llm(
+     prompt: str,
+     *,
+     temperature: float,
+     max_tokens: int,
+     use_codegen: bool = False,
+     node_name: str = "unknown",
+ ) -> str:
+     """Single-turn chat LLM call; returns the text content."""
+     base_config = _base_codegen_llm_config() if use_codegen else _base_llm_config()
+     model_name = base_config.get("model", "unknown")
+
+     logger.info(
+         "[LLM Input] Node: %s | Model: %s | UseCodegen: %s",
+         node_name,
+         model_name,
+         use_codegen,
+     )
+     logger.info("[LLM Input Full] Node: %s\n%s", node_name, prompt)
+
+     llm = _build_chat_llm(
+         temperature=temperature,
+         max_tokens=max_tokens,
+         use_codegen=use_codegen,
+     )
+     response = llm.invoke([HumanMessage(content=prompt)])
+     response_content = getattr(response, "content", "") or ""
+
+     logger.info("[LLM Output] Node: %s | Model: %s", node_name, model_name)
+     logger.info("[LLM Output Full] Node: %s\n%s", node_name, response_content)
+
+     return response_content
+
+
+ def run_multimodal_llm(
+     content_payload: List[dict],
+     *,
+     node_name: str = "unknown",
+ ) -> str:
+     """Multimodal LLM call, currently used for image analysis."""
+     multimodal_config = get_multimodal_llm_config()
+     model_name = multimodal_config.get("model", "unknown")
+
+     messages = [{"role": "user", "content": content_payload}]
+
+     logger.info(
+         "[Multimodal LLM Input] Node: %s | Model: %s",
+         node_name,
+         model_name,
+     )
+
+     response = MultiModalConversation.call(
+         api_key=multimodal_config["api_key"],
+         model=model_name,
+         messages=messages,
+     )
+
+     text = ""
+     if response.output and response.output.choices:
+         text = response.output.choices[0].message.content[0].get("text", "") or ""
+
+     logger.info(
+         "[Multimodal LLM Output] Node: %s | Model: %s",
+         node_name,
+         model_name,
+     )
+     logger.info("[Multimodal LLM Output Full] Node: %s\n%s", node_name, text)
+
+     return text
+
+
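
A minimal way to exercise run_llm once config.json is in place is sketched below. Because this module uses flat imports (from config import ..., from logging_utils import ...), the sketch assumes the interpreter is started from inside the gp_agent_tool/ directory; the prompt and parameter values are arbitrary.

    from llm_client import run_llm  # flat import; run from within gp_agent_tool/

    answer = run_llm(
        "In one sentence, what does genomic prediction estimate?",
        temperature=0.2,
        max_tokens=256,
        node_name="demo",
    )
    print(answer)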
gp_agent_tool/logging_utils.py
@@ -0,0 +1,24 @@
+ import logging
+ from typing import Optional
+
+
+ def get_logger(name: Optional[str] = None) -> logging.Logger:
+     """
+     Simple logger wrapper that avoids depending on the old project.
+
+     - Defaults to INFO level.
+     - Adds a StreamHandler only if the root logger has no handlers configured yet.
+     """
+     logger = logging.getLogger(name)
+     if not logging.getLogger().handlers:
+         handler = logging.StreamHandler()
+         formatter = logging.Formatter(
+             "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+         )
+         handler.setFormatter(formatter)
+         logging.getLogger().addHandler(handler)
+         logging.getLogger().setLevel(logging.INFO)
+     return logger
+
+
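
For completeness, a two-line use of the helper above: the first call installs the root handler and level, and later calls reuse them. The flat import mirrors how config.py and llm_client.py import this module.

    from logging_utils import get_logger  # flat import, as in config.py and llm_client.py

    log = get_logger(__name__)
    log.info("gp_agent_tool logging initialized")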