gpbench-1.0.0-py3-none-any.whl
- gp_agent_tool/compute_dataset_feature.py +67 -0
- gp_agent_tool/config.py +65 -0
- gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
- gp_agent_tool/experience/dataset_summary_info.py +13 -0
- gp_agent_tool/experience/experience_info.py +12 -0
- gp_agent_tool/experience/get_matched_experience.py +111 -0
- gp_agent_tool/llm_client.py +119 -0
- gp_agent_tool/logging_utils.py +24 -0
- gp_agent_tool/main.py +347 -0
- gp_agent_tool/read_agent/__init__.py +46 -0
- gp_agent_tool/read_agent/nodes.py +674 -0
- gp_agent_tool/read_agent/prompts.py +547 -0
- gp_agent_tool/read_agent/python_repl_tool.py +165 -0
- gp_agent_tool/read_agent/state.py +101 -0
- gp_agent_tool/read_agent/workflow.py +54 -0
- gpbench/__init__.py +25 -0
- gpbench/_selftest.py +104 -0
- gpbench/method_class/BayesA/BayesA_class.py +141 -0
- gpbench/method_class/BayesA/__init__.py +5 -0
- gpbench/method_class/BayesA/_bayesfromR.py +96 -0
- gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
- gpbench/method_class/BayesA/bayesAfromR.py +16 -0
- gpbench/method_class/BayesB/BayesB_class.py +140 -0
- gpbench/method_class/BayesB/__init__.py +5 -0
- gpbench/method_class/BayesB/_bayesfromR.py +96 -0
- gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
- gpbench/method_class/BayesB/bayesBfromR.py +16 -0
- gpbench/method_class/BayesC/BayesC_class.py +141 -0
- gpbench/method_class/BayesC/__init__.py +4 -0
- gpbench/method_class/BayesC/_bayesfromR.py +96 -0
- gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
- gpbench/method_class/BayesC/bayesCfromR.py +16 -0
- gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
- gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
- gpbench/method_class/CropARNet/__init__.py +5 -0
- gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
- gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
- gpbench/method_class/Cropformer/__init__.py +5 -0
- gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
- gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
- gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
- gpbench/method_class/DL_GWAS/__init__.py +5 -0
- gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
- gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
- gpbench/method_class/DNNGP/__init__.py +5 -0
- gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
- gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
- gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
- gpbench/method_class/DeepCCR/__init__.py +5 -0
- gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
- gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
- gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
- gpbench/method_class/DeepGS/__init__.py +5 -0
- gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
- gpbench/method_class/EIR/EIR_class.py +276 -0
- gpbench/method_class/EIR/EIR_he_class.py +184 -0
- gpbench/method_class/EIR/__init__.py +5 -0
- gpbench/method_class/EIR/utils/__init__.py +0 -0
- gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
- gpbench/method_class/EIR/utils/common.py +65 -0
- gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
- gpbench/method_class/EIR/utils/logging.py +59 -0
- gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
- gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
- gpbench/method_class/EIR/utils/transformer_models.py +546 -0
- gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
- gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
- gpbench/method_class/ElasticNet/__init__.py +5 -0
- gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
- gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
- gpbench/method_class/G2PDeep/__init__.py +5 -0
- gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
- gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
- gpbench/method_class/GBLUP/__init__.py +5 -0
- gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
- gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
- gpbench/method_class/GEFormer/__init__.py +5 -0
- gpbench/method_class/GEFormer/gMLP_class.py +357 -0
- gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
- gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
- gpbench/method_class/LightGBM/__init__.py +5 -0
- gpbench/method_class/RF/RF_GPU_class.py +165 -0
- gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
- gpbench/method_class/RF/__init__.py +5 -0
- gpbench/method_class/SVC/SVC_GPU.py +181 -0
- gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
- gpbench/method_class/SVC/__init__.py +5 -0
- gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
- gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
- gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
- gpbench/method_class/SoyDNGP/__init__.py +5 -0
- gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
- gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
- gpbench/method_class/XGBoost/__init__.py +5 -0
- gpbench/method_class/__init__.py +52 -0
- gpbench/method_class/rrBLUP/__init__.py +5 -0
- gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
- gpbench/method_reg/BayesA/BayesA.py +116 -0
- gpbench/method_reg/BayesA/__init__.py +5 -0
- gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
- gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
- gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
- gpbench/method_reg/BayesB/BayesB.py +117 -0
- gpbench/method_reg/BayesB/__init__.py +5 -0
- gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
- gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
- gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
- gpbench/method_reg/BayesC/BayesC.py +115 -0
- gpbench/method_reg/BayesC/__init__.py +5 -0
- gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
- gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
- gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
- gpbench/method_reg/CropARNet/CropARNet.py +159 -0
- gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
- gpbench/method_reg/CropARNet/__init__.py +5 -0
- gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
- gpbench/method_reg/Cropformer/Cropformer.py +313 -0
- gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
- gpbench/method_reg/Cropformer/__init__.py +5 -0
- gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
- gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
- gpbench/method_reg/DL_GWAS/__init__.py +5 -0
- gpbench/method_reg/DNNGP/DNNGP.py +157 -0
- gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
- gpbench/method_reg/DNNGP/__init__.py +5 -0
- gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
- gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
- gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
- gpbench/method_reg/DeepCCR/__init__.py +5 -0
- gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
- gpbench/method_reg/DeepGS/DeepGS.py +165 -0
- gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
- gpbench/method_reg/DeepGS/__init__.py +5 -0
- gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
- gpbench/method_reg/EIR/EIR.py +258 -0
- gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
- gpbench/method_reg/EIR/__init__.py +5 -0
- gpbench/method_reg/EIR/utils/__init__.py +0 -0
- gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
- gpbench/method_reg/EIR/utils/common.py +65 -0
- gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
- gpbench/method_reg/EIR/utils/logging.py +59 -0
- gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
- gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
- gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
- gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
- gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
- gpbench/method_reg/ElasticNet/__init__.py +5 -0
- gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
- gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
- gpbench/method_reg/G2PDeep/__init__.py +5 -0
- gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
- gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
- gpbench/method_reg/GBLUP/__init__.py +5 -0
- gpbench/method_reg/GEFormer/GEFormer.py +164 -0
- gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
- gpbench/method_reg/GEFormer/__init__.py +5 -0
- gpbench/method_reg/GEFormer/gMLP.py +341 -0
- gpbench/method_reg/LightGBM/LightGBM.py +237 -0
- gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
- gpbench/method_reg/LightGBM/__init__.py +5 -0
- gpbench/method_reg/MVP/MVP.py +182 -0
- gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
- gpbench/method_reg/MVP/__init__.py +5 -0
- gpbench/method_reg/MVP/base_MVP.py +113 -0
- gpbench/method_reg/RF/RF_GPU.py +174 -0
- gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
- gpbench/method_reg/RF/__init__.py +5 -0
- gpbench/method_reg/SVC/SVC_GPU.py +194 -0
- gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
- gpbench/method_reg/SVC/__init__.py +5 -0
- gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
- gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
- gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
- gpbench/method_reg/SoyDNGP/__init__.py +5 -0
- gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
- gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
- gpbench/method_reg/XGBoost/__init__.py +5 -0
- gpbench/method_reg/__init__.py +55 -0
- gpbench/method_reg/rrBLUP/__init__.py +5 -0
- gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
- gpbench-1.0.0.dist-info/METADATA +379 -0
- gpbench-1.0.0.dist-info/RECORD +188 -0
- gpbench-1.0.0.dist-info/WHEEL +5 -0
- gpbench-1.0.0.dist-info/entry_points.txt +2 -0
- gpbench-1.0.0.dist-info/top_level.txt +3 -0
- tests/test_import.py +80 -0
- tests/test_method.py +232 -0
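
The listing above is the wheel's full manifest. As a minimal sketch (wheel filename assumed from the header above), the dist-info files it mentions can be inspected straight from the archive without installing:

# Hypothetical: peek at the published wheel's metadata from the archive itself.
import zipfile

with zipfile.ZipFile("gpbench-1.0.0-py3-none-any.whl") as whl:
    info = "gpbench-1.0.0.dist-info"
    print(whl.read(f"{info}/entry_points.txt").decode())  # console-script entry points (+2 lines above)
    print(whl.read(f"{info}/top_level.txt").decode())     # top-level import packages (+3 lines above)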
gp_agent_tool/read_agent/prompts.py
@@ -0,0 +1,547 @@
"""
Read Agent prompt templates (aligned with the textMSA version, with the textmsa dependency removed).
"""

from __future__ import annotations

import json
from typing import Any, Optional

# Relative import: logging_utils lives one package level up, in gp_agent_tool/
# (a bare `from logging_utils import ...` only resolves if that directory is on sys.path).
from ..logging_utils import get_logger


logger = get_logger(__name__)


def _normalize_language(language: Optional[str]) -> str:
    """Normalize language input; default to English."""
    if not language:
        return "en"
    lower = language.lower()
    if lower.startswith("zh"):
        return "zh"
    if lower.startswith("en"):
        return "en"
    return "en"


def _get_prompt(prompt_map: dict[str, str], language: Optional[str]) -> str:
    """Select prompt by language with English fallback."""
    lang = _normalize_language(language)
    return prompt_map.get(lang, prompt_map["en"])


PLAN_PROMPT = {
    "en": """
You are a file reading planning assistant. Based on the user question and the provided file overview, create a SEQUENTIAL reading plan that lists all files that need to be processed in a specific order.

User question:
{user_query}

File overview:
{file_overview}

**Planning Rules:**
1. **Select appropriate files:**
   - For text files (.txt, .md, .py, .js, config files, README, notes, documentation): Include them in the plan for direct reading
   - For image files (.png, .jpg, .jpeg, .gif, .bmp, .svg): Include them in the plan for image analysis
   - For data files (.csv, .h5ad, .json, .parquet, .xlsx, etc.): Include them in the plan for code-based analysis
   - Only include files that are relevant to answering the user question

2. **Plan structure:**
   - Each plan item should specify: file_name, file_path, plan_detail, and order_reasoning
   - plan_detail should describe what needs to be done with this file (e.g., "Read and summarize the content", "Analyze the data and extract key statistics", "Describe the image content")
   - order_reasoning should explain WHY this file should be read at this position in the sequence (e.g., "Read first to understand the project structure", "Read after config files to understand data format", "Read last to synthesize all information")

3. **Sequential ordering is critical:**
   - List files in the EXACT order they should be processed sequentially
   - Even if files are independent, you must determine an optimal reading order
   - Consider: foundational files first (README, configs), then supporting files, then data files, then analysis files
   - The order_reasoning for each file should explain its position relative to other files
   - Provide overall reasoning that explains the entire sequence strategy

Return JSON:
{{
  "plans": [
    {{
      "file_name": "filename.ext",
      "file_path": "/path/to/file",
      "plan_detail": "Description of what to do with this file",
      "order_reasoning": "Why this file should be read at this position (e.g., 'Read first because it contains project overview', 'Read after file X to understand context', etc.)"
    }}
  ],
  "reasoning": "Overall reasoning for the entire sequential reading plan, explaining the strategy and why files are ordered this way"
}}

**Important Notes:**
- Do not use placeholders or example values
- The order matters: files will be read sequentially, and each read can see results from previous reads
- Provide clear reasoning for the order, even if files seem independent
""",
    "zh": """
你是文件阅读规划助手。根据用户问题和外部提供的文件概览字符串,制定顺序阅读计划,按特定顺序列出所有需要处理的文件。

用户问题:
{user_query}

文件概览:
{file_overview}

**规划规则:**
1. **选择合适的文件:**
   - 对于文本文件(.txt, .md, .py, .js, 配置文件, README, 笔记, 文档等):包含在计划中,用于直接读取
   - 对于图像文件(.png, .jpg, .jpeg, .gif, .bmp, .svg 等):包含在计划中,用于图像分析
   - 对于数据文件(.csv, .h5ad, .json, .parquet, .xlsx 等):包含在计划中,用于基于代码的分析
   - 只包含与回答用户问题相关的文件

2. **计划结构:**
   - 每个计划项应指定:file_name, file_path, plan_detail, 和 order_reasoning
   - plan_detail 应描述需要对该文件做什么(例如:"读取并总结内容"、"分析数据并提取关键统计信息"、"描述图像内容")
   - order_reasoning 应解释为什么该文件应该在此顺序位置读取(例如:"首先读取以了解项目结构"、"在配置文件之后读取以了解数据格式"、"最后读取以综合所有信息")

3. **顺序至关重要:**
   - 按顺序处理的精确顺序列出文件
   - 即使文件是独立的,也必须确定最优的阅读顺序
   - 考虑:基础文件优先(README、配置文件),然后是支持文件,然后是数据文件,最后是分析文件
   - 每个文件的 order_reasoning 应解释其相对于其他文件的位置
   - 提供整体推理,解释整个顺序策略

请返回JSON:
{{
  "plans": [
    {{
      "file_name": "文件名.ext",
      "file_path": "/路径/到/文件",
      "plan_detail": "对该文件需要做什么的描述",
      "order_reasoning": "为什么该文件应该在此位置读取(例如:'首先读取因为它包含项目概览'、'在文件X之后读取以了解上下文'等)"
    }}
  ],
  "reasoning": "整个顺序阅读计划的整体推理,解释策略以及为什么文件按此顺序排列"
}}

**重要提示:**
- 不要使用占位符或示例值
- 顺序很重要:文件将按顺序读取,每次读取都可以看到之前读取的结果
- 即使文件看起来是独立的,也要为顺序提供清晰的推理
""",
}


DATA_PREVIEW_ANALYSIS_PROMPT = {
    "en": """
You are a data analysis planning assistant. Based on the user query and data file preview, generate guidance information to help with subsequent code generation.

User query:
{user_query}

File information:
{file_info}

**Previous Reading Results:**
{previous_results}

**Task:**
Analyze the data preview and user query, then generate structured guidance that includes:
1. **Data Characteristics**: Key features of the data (structure, columns, data types, size, etc.)
2. **Analysis Objectives**: What needs to be analyzed based on the user query
3. **Code Generation Strategy**: Recommended approach for generating analysis code (which libraries to use, key operations needed, etc.)
4. **Important Notes**: Potential pitfalls, data quality issues, or special considerations

**Return Format:**
You MUST return a JSON object (not markdown code block) with the following structure:
{{
  "guidance": "A comprehensive guidance text that summarizes data characteristics, analysis objectives, code generation strategy, and important notes"
}}
""",
    "zh": """
你是数据分析规划助手。根据用户查询和数据文件预览,生成指导信息以帮助后续的代码生成。

用户查询:
{user_query}

文件信息:
{file_info}

**之前的读取结果:**
{previous_results}

**任务:**
分析数据预览和用户查询,然后生成结构化的指导信息,包括:
1. **数据特征**:数据的关键特征(结构、列、数据类型、大小等)
2. **分析目标**:根据用户查询需要分析什么
3. **代码生成策略**:生成分析代码的推荐方法(使用哪些库、需要的关键操作等)
4. **重要注意事项**:潜在的陷阱、数据质量问题或特殊考虑

**返回格式:**
你必须返回一个 JSON 对象(不要使用 markdown 代码块),格式如下:
{{
  "guidance": "综合的指导文本,总结数据特征、分析目标、代码生成策略和重要注意事项"
}}
""",
}


CODE_GENERATION_PROMPT = {
    "en": """
You are a code generation assistant. Generate Python code to analyze the specified data file according to the given instruction.

Instruction:
{instruction}

File information:
{file_info}

**Previous Reading Results:**
{previous_results}

**Analysis Guidance:**
{analysis_guidance}

**Python Library Guidelines for Structured Data Files:**
- **CSV files**: Use `pandas.read_csv()` for reading. Use `pandas.DataFrame.to_csv()` for writing.
- **JSON data files**: Use `json.load()` / `json.dump()` or `pandas.read_json()` for reading. Use `json.dump()` / `json.dumps()` or `pandas.DataFrame.to_json()` for writing.
- **h5ad files (AnnData)**: Use `scanpy.read_h5ad()` or `anndata.read_h5ad()` for reading. Use `adata.write_h5ad()` for writing.
- **HDF5 files**: Use `h5py.File()` or `pandas.read_hdf()` for reading. Use `h5py.File().create_dataset()` or `pandas.DataFrame.to_hdf()` for writing.
- **Parquet files**: Use `pandas.read_parquet()` or `pyarrow.parquet.read_table()` for reading. Use `pandas.DataFrame.to_parquet()` or `pyarrow.parquet.write_table()` for writing.
- **Excel files**: Use `pandas.read_excel()` (requires `openpyxl` or `xlrd` engine) for reading. Use `pandas.DataFrame.to_excel()` for writing.

**Output Requirements:**
- Read files using the provided real paths.
- Input files may be outside the working directory; read them from their provided paths without relocating.
- Write all output files into the working directory (no outputs outside the working directory).
- **IMPORTANT**: Print the analysis results to stdout so they can be captured. Use print() statements to output key findings, statistics, summaries, etc.
- **NO PLOTTING**: **ABSOLUTELY FORBIDDEN** to use any visualization libraries (e.g., matplotlib, seaborn, plotly, etc.) for plotting during code generation. **DO NOT** generate any plots, images, or visualization outputs. If visualization is needed for analysis, use print() to output numerical results, statistical summaries, and other text information instead.
- Do not fabricate data or use example data.
- The generated code must be directly usable on the actual input data files. Do NOT generate placeholder code that waits for future/real data, and do NOT use any mock/simulated data. Always perform analysis based on the provided file paths and current data.
- When errors occur, surface detailed error information (stack trace and contextual details) so failures are easy to debug.
- **CRITICAL - Error Handling Rules:**
  - **NEVER** use try-except blocks that silently swallow exceptions without re-raising them or printing detailed error information
  - If you use try-except, you MUST either:
    (1) Re-raise the exception after logging/printing it, OR
    (2) Print the full error details (including traceback) to stderr using `traceback.print_exc()` or `sys.stderr.write()`
  - **DO NOT** catch exceptions and only print a simple error message without the full traceback
  - **DO NOT** catch exceptions and continue execution silently - this makes debugging impossible
  - If error handling is needed, prefer letting exceptions propagate naturally, or use proper error handling that preserves error information
- **Consider previous reading results**: You can reference information from previously read files to better understand the context and generate more relevant analysis code.

**Result Output Requirements (for stdout output):**
- The output must strictly adhere to the instruction and only report what is found in the execution results
- Use natural narrative text, not lists or structured formats
- Do NOT include any suggestions, recommendations, or advice beyond what is in the execution results
- Do NOT provide any recommendations or suggestions that go beyond the scope of the execution results

**Return Format:**
You MUST return a JSON object (not markdown code block) with the following structure:
{{
  "code": "The Python code as a string (no markdown code block, just raw code)"
}}
""",
    "zh": """
你是代码生成助手。根据给定的指令生成 Python 代码来分析指定的数据文件。

指令:
{instruction}

文件信息:
{file_info}

**之前的读取结果:**
{previous_results}

**分析指导:**
{analysis_guidance}

**结构化数据文件的 Python 库使用指南:**
- **CSV 文件**:使用 `pandas.read_csv()` 读取。使用 `pandas.DataFrame.to_csv()` 写入。
- **JSON 数据文件**:使用 `json.load()` / `json.dump()` 或 `pandas.read_json()` 读取。使用 `json.dump()` / `json.dumps()` 或 `pandas.DataFrame.to_json()` 写入。
- **h5ad 文件(AnnData)**:使用 `scanpy.read_h5ad()` 或 `anndata.read_h5ad()` 读取。使用 `adata.write_h5ad()` 写入。
- **HDF5 文件**:使用 `h5py.File()` 或 `pandas.read_hdf()` 读取。使用 `h5py.File().create_dataset()` 或 `pandas.DataFrame.to_hdf()` 写入。
- **Parquet 文件**:使用 `pandas.read_parquet()` 或 `pyarrow.parquet.read_table()` 读取。使用 `pandas.DataFrame.to_parquet()` 或 `pyarrow.parquet.write_table()` 写入。
- **Excel 文件**:使用 `pandas.read_excel()`(需要 `openpyxl` 或 `xlrd` 引擎)读取。使用 `pandas.DataFrame.to_excel()` 写入。

**输出要求:**
- 读取文件时使用提供的真实路径。
- 输入文件可能不在工作目录中,请直接使用提供的路径读取,不要搬移。
- 将所有输出文件写入工作目录(不要写到工作目录以外)。
- **重要**:将分析结果打印到 stdout,以便可以捕获。使用 print() 语句输出关键发现、统计信息、摘要等。
- **禁止画图**:代码生成过程中**绝对禁止**使用任何可视化库(如 matplotlib、seaborn、plotly 等)进行画图操作,**禁止**生成任何图表、图像文件或可视化输出。如果分析需要可视化,请使用 print() 输出数值结果、统计摘要等文本信息。
- 不要伪造数据或使用示例数据。
- 生成的代码必须直接面向实际输入数据文件,可立即运行。不要生成依赖“未来补充真实数据”的占位代码,也不要使用任何模拟/伪造/示例数据,必须基于当前提供的文件路径和真实数据执行分析。
- 发生错误时需要输出详细的错误信息(包含堆栈与关键上下文),方便定位问题。
- **关键 - 错误处理规则:**
  - **绝对禁止**使用会静默吞掉异常而不重新抛出或打印详细错误信息的 try-except 代码块
  - 如果使用 try-except,你必须:
    (1) 在记录/打印后重新抛出异常,或者
    (2) 使用 `traceback.print_exc()` 或 `sys.stderr.write()` 将完整错误详情(包括堆栈跟踪)打印到 stderr
  - **不要**捕获异常后只打印简单错误信息而不包含完整堆栈跟踪
  - **不要**捕获异常后静默继续执行 - 这会使调试变得不可能
  - 如果需要错误处理,优先让异常自然传播,或使用能保留错误信息的正确错误处理方式
- **考虑之前的读取结果**:你可以参考之前读取文件的信息,以更好地理解上下文并生成更相关的分析代码。

**结果输出要求(stdout / stderr 区分):**
- 正常的分析结果(成功执行)必须使用 print() 打印到 stdout。
- 错误信息、异常和完整堆栈必须写入 stderr(要么让异常自然抛出,要么使用 `traceback.print_exc()` / `sys.stderr.write()`)。
- 输出内容必须严格忠于指令,只报告执行结果中发现的内容。
- 使用自然的叙述性文字,不要使用列表或结构化格式。
- 不包含任何建议、推荐或超出执行结果范围的建议。

**返回格式:**
你必须返回一个 JSON 对象(不要使用 markdown 代码块),格式如下:
{{
  "code": "Python 代码字符串(不要使用 markdown 代码块,直接返回原始代码)"
}}
""",
}


CODE_RETRY_PROMPT = {
    "en": """
You are a code generation assistant. The previous code execution failed. Please fix the code based on the error message and try again.

User query:
{user_query}

Original instruction:
{instruction}

File information:
{file_info}

**Previous Reading Results:**
{previous_results}

Previous code:
{previous_code}

Error message (stderr):
{error_message}

Please analyze the error, fix the code, and return the corrected version. Consider the previous reading results for context.

**CRITICAL - Error Handling Rules:**
- **NEVER** use try-except blocks that silently swallow exceptions without re-raising them or printing detailed error information
- If you use try-except, you MUST either:
  (1) Re-raise the exception after logging/printing it, OR
  (2) Print the full error details (including traceback) to stderr using `traceback.print_exc()` or `sys.stderr.write()`
- **DO NOT** catch exceptions and only print a simple error message without the full traceback
- **DO NOT** catch exceptions and continue execution silently - this makes debugging impossible
- If error handling is needed, prefer letting exceptions propagate naturally, or use proper error handling that preserves error information

**Result Output Requirements (for stdout output):**
- The output must strictly adhere to the instruction and only report what is found in the execution results
- Use natural narrative text, not lists or structured formats
- Do NOT include any suggestions, recommendations, or advice beyond what is in the execution results
- Do NOT provide any recommendations or suggestions that go beyond the scope of the execution results

**Return Format:**
You MUST return a JSON object (not markdown code block) with the following structure:
{{
  "code": "The corrected Python code as a string (no markdown code block, just raw code)"
}}
""",
    "zh": """
你是代码生成助手。之前的代码执行失败了。请根据错误信息修复代码并重试。

用户问题:
{user_query}

原始指令:
{instruction}

文件信息:
{file_info}

**之前的读取结果:**
{previous_results}

之前的代码:
{previous_code}

错误信息(stderr):
{error_message}

请分析错误,修复代码,并返回修正后的版本。考虑之前的读取结果以获取上下文。

**关键 - 错误处理规则:**
- **绝对禁止**使用会静默吞掉异常而不重新抛出或打印详细错误信息的 try-except 代码块
- 如果使用 try-except,你必须:
  (1) 在记录/打印后重新抛出异常,或者
  (2) 使用 `traceback.print_exc()` 或 `sys.stderr.write()` 将完整错误详情(包括堆栈跟踪)打印到 stderr
- **不要**捕获异常后只打印简单错误信息而不包含完整堆栈跟踪
- **不要**捕获异常后静默继续执行 - 这会使调试变得不可能
- 如果需要错误处理,优先让异常自然传播,或使用能保留错误信息的正确错误处理方式

**结果输出要求(针对 stdout 输出):**
- 输出必须严格忠于指令,只报告执行结果中发现的内容
- 使用自然的叙述性文字,不要使用列表或结构化格式
- 不包含任何建议、推荐或超出执行结果范围的建议
- 不要提供任何超出执行结果范围的推荐或建议

**返回格式:**
你必须返回一个 JSON 对象(不要使用 markdown 代码块),格式如下:
{{
  "code": "修正后的 Python 代码字符串(不要使用 markdown 代码块,直接返回原始代码)"
}}
""",
}


ANSWER_PROMPT = {
    "en": """
You are a report generation assistant. Based on the user question and all execution results, produce a comprehensive narrative answer.

User question:
{user_query}

Execution results:
{execution_results}

Return JSON:
{{
  "final_answer": "The narrative final answer based entirely on execution results."
}}
""",
    "zh": """
你是报告生成助手。根据用户问题和所有执行结果,生成综合性的叙述性最终答案。

用户问题:
{user_query}

执行结果:
{execution_results}

请返回 JSON:
{{
  "final_answer": "完全基于执行结果的叙述性最终答案。"
}}
""",
}


def _safe_to_jsonable(data: object):
    if isinstance(data, (str, int, float, bool)) or data is None:
        return data
    if isinstance(data, dict):
        return {k: _safe_to_jsonable(v) for k, v in data.items()}
    if isinstance(data, (list, tuple)):
        return [_safe_to_jsonable(v) for v in data]
    return str(data)


def _fmt_json(data: object) -> str:
    if data is None:
        return "null"
    if isinstance(data, str):
        return data
    return json.dumps(_safe_to_jsonable(data), ensure_ascii=False, indent=2)


def format_plan_prompt(
    user_query: str,
    file_overview: str,
    language: Optional[str] = "en",
) -> str:
    return _get_prompt(PLAN_PROMPT, language).format(
        user_query=user_query,
        file_overview=file_overview,
    )


def format_data_preview_analysis_prompt(
    user_query: str,
    file_info: dict[str, Any],
    previous_results: str = "",
    language: Optional[str] = "en",
) -> str:
    return _get_prompt(DATA_PREVIEW_ANALYSIS_PROMPT, language).format(
        user_query=user_query,
        file_info=_fmt_json(file_info),
        previous_results=previous_results or "No previous files have been read yet.",
    )


def format_code_generation_prompt(
    instruction: str,
    file_info: dict[str, Any],
    previous_results: str = "",
    analysis_guidance: str = "",
    language: Optional[str] = "en",
) -> str:
    return _get_prompt(CODE_GENERATION_PROMPT, language).format(
        instruction=instruction,
        file_info=_fmt_json(file_info),
        previous_results=previous_results or "No previous files have been read yet.",
        analysis_guidance=analysis_guidance or "No analysis guidance available.",
    )


def format_code_retry_prompt(
    user_query: str,
    instruction: str,
    file_info: dict[str, Any],
    previous_code: str,
    error_message: str,
    previous_results: str = "",
    language: Optional[str] = "en",
) -> str:
    return _get_prompt(CODE_RETRY_PROMPT, language).format(
        user_query=user_query,
        instruction=instruction,
        file_info=_fmt_json(file_info),
        previous_code=previous_code,
        error_message=error_message,
        previous_results=previous_results or "No previous files have been read yet.",
    )


TEXT_SUMMARY_PROMPT = {
    "en": """
You are a text analysis assistant. Analyze the following file content and answer the user's question based on the content and previous reading results.

User question/requirement: {instruction}

Previous Reading Results:
{previous_results}

Current File Content:
{file_content}
""",
    "zh": """
你是文本分析助手。请分析以下文件内容,并根据内容和之前的读取结果回答用户问题。

用户问题/需求:{instruction}

之前的读取结果:
{previous_results}

当前文件内容:
{file_content}
""",
}


def format_text_summary_prompt(
    instruction: str,
    file_content: str,
    previous_results: str = "",
    language: Optional[str] = "en",
) -> str:
    return _get_prompt(TEXT_SUMMARY_PROMPT, language).format(
        instruction=instruction,
        file_content=file_content,
        previous_results=previous_results
        or ("尚未读取任何文件。" if _normalize_language(language) == "zh" else "No previous files have been read yet."),
    )


def format_answer_prompt(
    user_query: str,
    execution_results: list[dict[str, Any]],
    language: Optional[str] = "en",
) -> str:
    return _get_prompt(ANSWER_PROMPT, language).format(
        user_query=user_query,
        execution_results=_fmt_json(execution_results),
    )
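
A minimal usage sketch for the formatters defined above (the query, overview, and file_info values are invented for illustration; any language value not starting with "zh" or "en" falls back to the English template):

# Hypothetical usage of the prompt formatters; all argument values are illustrative only.
from gp_agent_tool.read_agent.prompts import (
    format_plan_prompt,
    format_code_generation_prompt,
)

plan_prompt = format_plan_prompt(
    user_query="Which phenotype columns does the dataset contain?",
    file_overview="data/traits.csv (2.1 MB); README.md (4 KB)",
    language="zh-CN",  # startswith("zh") -> the Chinese template is selected
)

code_prompt = format_code_generation_prompt(
    instruction="Count rows and list the column names of traits.csv",
    file_info={"file_name": "traits.csv", "file_path": "/data/traits.csv"},
    previous_results="",   # empty -> rendered as "No previous files have been read yet."
    analysis_guidance="",  # empty -> rendered as "No analysis guidance available."
)
print(code_prompt[:300])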