gpbench 1.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. gpbench-1.0.0/PKG-INFO +379 -0
  2. gpbench-1.0.0/README.md +244 -0
  3. gpbench-1.0.0/gp_agent_tool/compute_dataset_feature.py +67 -0
  4. gpbench-1.0.0/gp_agent_tool/config.py +65 -0
  5. gpbench-1.0.0/gp_agent_tool/experience/create_masked_dataset_summary.py +97 -0
  6. gpbench-1.0.0/gp_agent_tool/experience/dataset_summary_info.py +13 -0
  7. gpbench-1.0.0/gp_agent_tool/experience/experience_info.py +12 -0
  8. gpbench-1.0.0/gp_agent_tool/experience/get_matched_experience.py +111 -0
  9. gpbench-1.0.0/gp_agent_tool/llm_client.py +119 -0
  10. gpbench-1.0.0/gp_agent_tool/logging_utils.py +24 -0
  11. gpbench-1.0.0/gp_agent_tool/main.py +347 -0
  12. gpbench-1.0.0/gp_agent_tool/read_agent/__init__.py +46 -0
  13. gpbench-1.0.0/gp_agent_tool/read_agent/nodes.py +674 -0
  14. gpbench-1.0.0/gp_agent_tool/read_agent/prompts.py +547 -0
  15. gpbench-1.0.0/gp_agent_tool/read_agent/python_repl_tool.py +165 -0
  16. gpbench-1.0.0/gp_agent_tool/read_agent/state.py +101 -0
  17. gpbench-1.0.0/gp_agent_tool/read_agent/workflow.py +54 -0
  18. gpbench-1.0.0/gpbench/__init__.py +25 -0
  19. gpbench-1.0.0/gpbench/_selftest.py +104 -0
  20. gpbench-1.0.0/gpbench/method_class/BayesA/BayesA_class.py +141 -0
  21. gpbench-1.0.0/gpbench/method_class/BayesA/__init__.py +5 -0
  22. gpbench-1.0.0/gpbench/method_class/BayesA/_bayesfromR.py +96 -0
  23. gpbench-1.0.0/gpbench/method_class/BayesA/_param_free_base_model.py +84 -0
  24. gpbench-1.0.0/gpbench/method_class/BayesA/bayesAfromR.py +16 -0
  25. gpbench-1.0.0/gpbench/method_class/BayesB/BayesB_class.py +140 -0
  26. gpbench-1.0.0/gpbench/method_class/BayesB/__init__.py +5 -0
  27. gpbench-1.0.0/gpbench/method_class/BayesB/_bayesfromR.py +96 -0
  28. gpbench-1.0.0/gpbench/method_class/BayesB/_param_free_base_model.py +84 -0
  29. gpbench-1.0.0/gpbench/method_class/BayesB/bayesBfromR.py +16 -0
  30. gpbench-1.0.0/gpbench/method_class/BayesC/BayesC_class.py +141 -0
  31. gpbench-1.0.0/gpbench/method_class/BayesC/__init__.py +4 -0
  32. gpbench-1.0.0/gpbench/method_class/BayesC/_bayesfromR.py +96 -0
  33. gpbench-1.0.0/gpbench/method_class/BayesC/_param_free_base_model.py +84 -0
  34. gpbench-1.0.0/gpbench/method_class/BayesC/bayesCfromR.py +16 -0
  35. gpbench-1.0.0/gpbench/method_class/CropARNet/CropARNet_class.py +186 -0
  36. gpbench-1.0.0/gpbench/method_class/CropARNet/CropARNet_he_class.py +154 -0
  37. gpbench-1.0.0/gpbench/method_class/CropARNet/__init__.py +5 -0
  38. gpbench-1.0.0/gpbench/method_class/CropARNet/base_CropARNet_class.py +178 -0
  39. gpbench-1.0.0/gpbench/method_class/Cropformer/Cropformer_class.py +308 -0
  40. gpbench-1.0.0/gpbench/method_class/Cropformer/__init__.py +5 -0
  41. gpbench-1.0.0/gpbench/method_class/Cropformer/cropformer_he_class.py +221 -0
  42. gpbench-1.0.0/gpbench/method_class/DL_GWAS/DL_GWAS_class.py +250 -0
  43. gpbench-1.0.0/gpbench/method_class/DL_GWAS/DL_GWAS_he_class.py +169 -0
  44. gpbench-1.0.0/gpbench/method_class/DL_GWAS/__init__.py +5 -0
  45. gpbench-1.0.0/gpbench/method_class/DNNGP/DNNGP_class.py +163 -0
  46. gpbench-1.0.0/gpbench/method_class/DNNGP/DNNGP_he_class.py +138 -0
  47. gpbench-1.0.0/gpbench/method_class/DNNGP/__init__.py +5 -0
  48. gpbench-1.0.0/gpbench/method_class/DNNGP/base_dnngp_class.py +116 -0
  49. gpbench-1.0.0/gpbench/method_class/DeepCCR/DeepCCR_class.py +172 -0
  50. gpbench-1.0.0/gpbench/method_class/DeepCCR/DeepCCR_he_class.py +161 -0
  51. gpbench-1.0.0/gpbench/method_class/DeepCCR/__init__.py +5 -0
  52. gpbench-1.0.0/gpbench/method_class/DeepCCR/base_DeepCCR_class.py +209 -0
  53. gpbench-1.0.0/gpbench/method_class/DeepGS/DeepGS_class.py +184 -0
  54. gpbench-1.0.0/gpbench/method_class/DeepGS/DeepGS_he_class.py +150 -0
  55. gpbench-1.0.0/gpbench/method_class/DeepGS/__init__.py +5 -0
  56. gpbench-1.0.0/gpbench/method_class/DeepGS/base_deepgs_class.py +153 -0
  57. gpbench-1.0.0/gpbench/method_class/EIR/EIR_class.py +276 -0
  58. gpbench-1.0.0/gpbench/method_class/EIR/EIR_he_class.py +184 -0
  59. gpbench-1.0.0/gpbench/method_class/EIR/__init__.py +5 -0
  60. gpbench-1.0.0/gpbench/method_class/EIR/utils/__init__.py +0 -0
  61. gpbench-1.0.0/gpbench/method_class/EIR/utils/array_output_modules.py +97 -0
  62. gpbench-1.0.0/gpbench/method_class/EIR/utils/common.py +65 -0
  63. gpbench-1.0.0/gpbench/method_class/EIR/utils/lcl_layers.py +235 -0
  64. gpbench-1.0.0/gpbench/method_class/EIR/utils/logging.py +59 -0
  65. gpbench-1.0.0/gpbench/method_class/EIR/utils/mlp_layers.py +92 -0
  66. gpbench-1.0.0/gpbench/method_class/EIR/utils/models_locally_connected.py +642 -0
  67. gpbench-1.0.0/gpbench/method_class/EIR/utils/transformer_models.py +546 -0
  68. gpbench-1.0.0/gpbench/method_class/ElasticNet/ElasticNet_class.py +133 -0
  69. gpbench-1.0.0/gpbench/method_class/ElasticNet/ElasticNet_he_class.py +91 -0
  70. gpbench-1.0.0/gpbench/method_class/ElasticNet/__init__.py +5 -0
  71. gpbench-1.0.0/gpbench/method_class/G2PDeep/G2PDeep_he_class.py +217 -0
  72. gpbench-1.0.0/gpbench/method_class/G2PDeep/G2Pdeep_class.py +205 -0
  73. gpbench-1.0.0/gpbench/method_class/G2PDeep/__init__.py +5 -0
  74. gpbench-1.0.0/gpbench/method_class/G2PDeep/base_G2PDeep_class.py +209 -0
  75. gpbench-1.0.0/gpbench/method_class/GBLUP/GBLUP_class.py +183 -0
  76. gpbench-1.0.0/gpbench/method_class/GBLUP/__init__.py +5 -0
  77. gpbench-1.0.0/gpbench/method_class/GEFormer/GEFormer_class.py +169 -0
  78. gpbench-1.0.0/gpbench/method_class/GEFormer/GEFormer_he_class.py +137 -0
  79. gpbench-1.0.0/gpbench/method_class/GEFormer/__init__.py +5 -0
  80. gpbench-1.0.0/gpbench/method_class/GEFormer/gMLP_class.py +357 -0
  81. gpbench-1.0.0/gpbench/method_class/LightGBM/LightGBM_class.py +224 -0
  82. gpbench-1.0.0/gpbench/method_class/LightGBM/LightGBM_he_class.py +121 -0
  83. gpbench-1.0.0/gpbench/method_class/LightGBM/__init__.py +5 -0
  84. gpbench-1.0.0/gpbench/method_class/RF/RF_GPU_class.py +165 -0
  85. gpbench-1.0.0/gpbench/method_class/RF/RF_GPU_he_class.py +124 -0
  86. gpbench-1.0.0/gpbench/method_class/RF/__init__.py +5 -0
  87. gpbench-1.0.0/gpbench/method_class/SVC/SVC_GPU.py +181 -0
  88. gpbench-1.0.0/gpbench/method_class/SVC/SVC_GPU_he.py +106 -0
  89. gpbench-1.0.0/gpbench/method_class/SVC/__init__.py +5 -0
  90. gpbench-1.0.0/gpbench/method_class/SoyDNGP/AlexNet_206_class.py +179 -0
  91. gpbench-1.0.0/gpbench/method_class/SoyDNGP/SoyDNGP_class.py +189 -0
  92. gpbench-1.0.0/gpbench/method_class/SoyDNGP/SoyDNGP_he_class.py +112 -0
  93. gpbench-1.0.0/gpbench/method_class/SoyDNGP/__init__.py +5 -0
  94. gpbench-1.0.0/gpbench/method_class/XGBoost/XGboost_GPU_class.py +198 -0
  95. gpbench-1.0.0/gpbench/method_class/XGBoost/XGboost_GPU_he_class.py +178 -0
  96. gpbench-1.0.0/gpbench/method_class/XGBoost/__init__.py +5 -0
  97. gpbench-1.0.0/gpbench/method_class/__init__.py +52 -0
  98. gpbench-1.0.0/gpbench/method_class/rrBLUP/__init__.py +5 -0
  99. gpbench-1.0.0/gpbench/method_class/rrBLUP/rrBLUP_class.py +140 -0
  100. gpbench-1.0.0/gpbench/method_reg/BayesA/BayesA.py +116 -0
  101. gpbench-1.0.0/gpbench/method_reg/BayesA/__init__.py +5 -0
  102. gpbench-1.0.0/gpbench/method_reg/BayesA/_bayesfromR.py +96 -0
  103. gpbench-1.0.0/gpbench/method_reg/BayesA/_param_free_base_model.py +84 -0
  104. gpbench-1.0.0/gpbench/method_reg/BayesA/bayesAfromR.py +16 -0
  105. gpbench-1.0.0/gpbench/method_reg/BayesB/BayesB.py +117 -0
  106. gpbench-1.0.0/gpbench/method_reg/BayesB/__init__.py +5 -0
  107. gpbench-1.0.0/gpbench/method_reg/BayesB/_bayesfromR.py +96 -0
  108. gpbench-1.0.0/gpbench/method_reg/BayesB/_param_free_base_model.py +84 -0
  109. gpbench-1.0.0/gpbench/method_reg/BayesB/bayesBfromR.py +16 -0
  110. gpbench-1.0.0/gpbench/method_reg/BayesC/BayesC.py +115 -0
  111. gpbench-1.0.0/gpbench/method_reg/BayesC/__init__.py +5 -0
  112. gpbench-1.0.0/gpbench/method_reg/BayesC/_bayesfromR.py +96 -0
  113. gpbench-1.0.0/gpbench/method_reg/BayesC/_param_free_base_model.py +84 -0
  114. gpbench-1.0.0/gpbench/method_reg/BayesC/bayesCfromR.py +16 -0
  115. gpbench-1.0.0/gpbench/method_reg/CropARNet/CropARNet.py +159 -0
  116. gpbench-1.0.0/gpbench/method_reg/CropARNet/CropARNet_Hyperparameters.py +109 -0
  117. gpbench-1.0.0/gpbench/method_reg/CropARNet/__init__.py +5 -0
  118. gpbench-1.0.0/gpbench/method_reg/CropARNet/base_CropARNet.py +137 -0
  119. gpbench-1.0.0/gpbench/method_reg/Cropformer/Cropformer.py +313 -0
  120. gpbench-1.0.0/gpbench/method_reg/Cropformer/Cropformer_Hyperparameters.py +250 -0
  121. gpbench-1.0.0/gpbench/method_reg/Cropformer/__init__.py +5 -0
  122. gpbench-1.0.0/gpbench/method_reg/DL_GWAS/DL_GWAS.py +186 -0
  123. gpbench-1.0.0/gpbench/method_reg/DL_GWAS/DL_GWAS_Hyperparameters.py +125 -0
  124. gpbench-1.0.0/gpbench/method_reg/DL_GWAS/__init__.py +5 -0
  125. gpbench-1.0.0/gpbench/method_reg/DNNGP/DNNGP.py +157 -0
  126. gpbench-1.0.0/gpbench/method_reg/DNNGP/DNNGP_Hyperparameters.py +118 -0
  127. gpbench-1.0.0/gpbench/method_reg/DNNGP/__init__.py +5 -0
  128. gpbench-1.0.0/gpbench/method_reg/DNNGP/base_dnngp.py +101 -0
  129. gpbench-1.0.0/gpbench/method_reg/DeepCCR/DeepCCR.py +149 -0
  130. gpbench-1.0.0/gpbench/method_reg/DeepCCR/DeepCCR_Hyperparameters.py +110 -0
  131. gpbench-1.0.0/gpbench/method_reg/DeepCCR/__init__.py +5 -0
  132. gpbench-1.0.0/gpbench/method_reg/DeepCCR/base_DeepCCR.py +171 -0
  133. gpbench-1.0.0/gpbench/method_reg/DeepGS/DeepGS.py +165 -0
  134. gpbench-1.0.0/gpbench/method_reg/DeepGS/DeepGS_Hyperparameters.py +114 -0
  135. gpbench-1.0.0/gpbench/method_reg/DeepGS/__init__.py +5 -0
  136. gpbench-1.0.0/gpbench/method_reg/DeepGS/base_deepgs.py +98 -0
  137. gpbench-1.0.0/gpbench/method_reg/EIR/EIR.py +258 -0
  138. gpbench-1.0.0/gpbench/method_reg/EIR/EIR_Hyperparameters.py +178 -0
  139. gpbench-1.0.0/gpbench/method_reg/EIR/__init__.py +5 -0
  140. gpbench-1.0.0/gpbench/method_reg/EIR/utils/__init__.py +0 -0
  141. gpbench-1.0.0/gpbench/method_reg/EIR/utils/array_output_modules.py +97 -0
  142. gpbench-1.0.0/gpbench/method_reg/EIR/utils/common.py +65 -0
  143. gpbench-1.0.0/gpbench/method_reg/EIR/utils/lcl_layers.py +235 -0
  144. gpbench-1.0.0/gpbench/method_reg/EIR/utils/logging.py +59 -0
  145. gpbench-1.0.0/gpbench/method_reg/EIR/utils/mlp_layers.py +92 -0
  146. gpbench-1.0.0/gpbench/method_reg/EIR/utils/models_locally_connected.py +642 -0
  147. gpbench-1.0.0/gpbench/method_reg/EIR/utils/transformer_models.py +546 -0
  148. gpbench-1.0.0/gpbench/method_reg/ElasticNet/ElasticNet.py +123 -0
  149. gpbench-1.0.0/gpbench/method_reg/ElasticNet/ElasticNet_he.py +83 -0
  150. gpbench-1.0.0/gpbench/method_reg/ElasticNet/__init__.py +5 -0
  151. gpbench-1.0.0/gpbench/method_reg/G2PDeep/G2PDeep_Hyperparameters.py +107 -0
  152. gpbench-1.0.0/gpbench/method_reg/G2PDeep/G2Pdeep.py +166 -0
  153. gpbench-1.0.0/gpbench/method_reg/G2PDeep/__init__.py +5 -0
  154. gpbench-1.0.0/gpbench/method_reg/G2PDeep/base_G2PDeep.py +209 -0
  155. gpbench-1.0.0/gpbench/method_reg/GBLUP/GBLUP_R.py +182 -0
  156. gpbench-1.0.0/gpbench/method_reg/GBLUP/__init__.py +5 -0
  157. gpbench-1.0.0/gpbench/method_reg/GEFormer/GEFormer.py +164 -0
  158. gpbench-1.0.0/gpbench/method_reg/GEFormer/GEFormer_Hyperparameters.py +106 -0
  159. gpbench-1.0.0/gpbench/method_reg/GEFormer/__init__.py +5 -0
  160. gpbench-1.0.0/gpbench/method_reg/GEFormer/gMLP.py +341 -0
  161. gpbench-1.0.0/gpbench/method_reg/LightGBM/LightGBM.py +237 -0
  162. gpbench-1.0.0/gpbench/method_reg/LightGBM/LightGBM_Hyperparameters.py +77 -0
  163. gpbench-1.0.0/gpbench/method_reg/LightGBM/__init__.py +5 -0
  164. gpbench-1.0.0/gpbench/method_reg/MVP/MVP.py +182 -0
  165. gpbench-1.0.0/gpbench/method_reg/MVP/MVP_Hyperparameters.py +126 -0
  166. gpbench-1.0.0/gpbench/method_reg/MVP/__init__.py +5 -0
  167. gpbench-1.0.0/gpbench/method_reg/MVP/base_MVP.py +113 -0
  168. gpbench-1.0.0/gpbench/method_reg/RF/RF_GPU.py +174 -0
  169. gpbench-1.0.0/gpbench/method_reg/RF/RF_Hyperparameters.py +163 -0
  170. gpbench-1.0.0/gpbench/method_reg/RF/__init__.py +5 -0
  171. gpbench-1.0.0/gpbench/method_reg/SVC/SVC_GPU.py +194 -0
  172. gpbench-1.0.0/gpbench/method_reg/SVC/SVC_Hyperparameters.py +107 -0
  173. gpbench-1.0.0/gpbench/method_reg/SVC/__init__.py +5 -0
  174. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/AlexNet_206.py +185 -0
  175. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/SoyDNGP.py +179 -0
  176. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/SoyDNGP_Hyperparameters.py +105 -0
  177. gpbench-1.0.0/gpbench/method_reg/SoyDNGP/__init__.py +5 -0
  178. gpbench-1.0.0/gpbench/method_reg/XGBoost/XGboost_GPU.py +188 -0
  179. gpbench-1.0.0/gpbench/method_reg/XGBoost/XGboost_Hyperparameters.py +167 -0
  180. gpbench-1.0.0/gpbench/method_reg/XGBoost/__init__.py +5 -0
  181. gpbench-1.0.0/gpbench/method_reg/__init__.py +55 -0
  182. gpbench-1.0.0/gpbench/method_reg/rrBLUP/__init__.py +5 -0
  183. gpbench-1.0.0/gpbench/method_reg/rrBLUP/rrBLUP.py +123 -0
  184. gpbench-1.0.0/gpbench.egg-info/PKG-INFO +379 -0
  185. gpbench-1.0.0/gpbench.egg-info/SOURCES.txt +374 -0
  186. gpbench-1.0.0/gpbench.egg-info/dependency_links.txt +1 -0
  187. gpbench-1.0.0/gpbench.egg-info/entry_points.txt +2 -0
  188. gpbench-1.0.0/gpbench.egg-info/requires.txt +120 -0
  189. gpbench-1.0.0/gpbench.egg-info/top_level.txt +6 -0
  190. gpbench-1.0.0/pyproject.toml +186 -0
  191. gpbench-1.0.0/setup.cfg +4 -0
  192. gpbench-1.0.0/tests/test_import.py +80 -0
  193. gpbench-1.0.0/tests/test_method.py +232 -0
gpbench-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,379 @@
+ Metadata-Version: 2.4
+ Name: gpbench
+ Version: 1.0.0
+ Summary: A benchmarking toolkit for genomic prediction with multiple methods and LLM-powered analysis
+ Author: GPBench Contributors
+ License: MIT
+ Keywords: genomic prediction,bioinformatics,machine learning,deep learning
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: numpy==1.26.4
+ Requires-Dist: pandas<2.2.3,>=2.0
+ Requires-Dist: scipy==1.13.1
+ Requires-Dist: matplotlib==3.9.4
+ Requires-Dist: seaborn==0.13.2
+ Requires-Dist: scikit-learn==1.6.1
+ Requires-Dist: torch==2.8.0
+ Requires-Dist: torchvision==0.23.0
+ Requires-Dist: torchmetrics==1.8.2
+ Requires-Dist: pytorch-lightning==2.5.6
+ Requires-Dist: lightning==2.5.6
+ Requires-Dist: lightning-utilities==0.15.2
+ Requires-Dist: tensorflow==2.20.0
+ Requires-Dist: keras==3.10.0
+ Requires-Dist: tensorboard==2.20.0
+ Requires-Dist: tensorboard-data-server==0.7.2
+ Requires-Dist: xgboost==2.1.4
+ Requires-Dist: lightgbm==4.6.0
+ Requires-Dist: optuna==2.10.0
+ Requires-Dist: umap-learn==0.5.9.post2
+ Requires-Dist: pynndescent==0.5.13
+ Requires-Dist: langchain-core==0.3.82
+ Requires-Dist: langchain-openai==0.3.35
+ Requires-Dist: langgraph==0.6.11
+ Requires-Dist: langgraph-checkpoint==2.1.2
+ Requires-Dist: langgraph-prebuilt==0.6.5
+ Requires-Dist: langgraph-sdk==0.2.9
+ Requires-Dist: langsmith==0.4.37
+ Requires-Dist: openai==2.8.1
+ Requires-Dist: dashscope==1.25.7
+ Requires-Dist: tiktoken==0.12.0
+ Requires-Dist: transformers==4.57.3
+ Requires-Dist: tokenizers==0.22.1
+ Requires-Dist: huggingface-hub==0.36.0
+ Requires-Dist: psutil==7.1.3
+ Requires-Dist: tqdm==4.67.1
+ Requires-Dist: pyyaml==6.0.3
+ Requires-Dist: requests==2.32.5
+ Requires-Dist: requests-toolbelt==1.0.0
+ Requires-Dist: python-dotenv==1.2.1
+ Requires-Dist: rich==13.9.4
+ Requires-Dist: rich-argparse==1.7.2
+ Requires-Dist: pyecharts==2.0.9
+ Requires-Dist: h5py==3.14.0
+ Requires-Dist: pandas-plink==2.2.9
+ Requires-Dist: xarray==2024.7.0
+ Requires-Dist: statsmodels==0.14.5
+ Requires-Dist: patsy==1.0.2
+ Requires-Dist: rpy2==3.5.16
+ Requires-Dist: aiohttp==3.13.2
+ Requires-Dist: httpx==0.28.1
+ Requires-Dist: httpcore==1.0.9
+ Requires-Dist: anyio==4.12.0
+ Requires-Dist: orjson==3.11.5
+ Requires-Dist: ormsgpack==1.11.0
+ Requires-Dist: simplejson==3.20.2
+ Requires-Dist: protobuf==6.33.0
+ Requires-Dist: flatbuffers==25.9.23
+ Requires-Dist: sympy==1.14.0
+ Requires-Dist: mpmath==1.3.0
+ Requires-Dist: opt-einsum==3.4.0
+ Requires-Dist: cmaes==0.12.0
+ Requires-Dist: flaml==2.3.6
+ Requires-Dist: pyro-api==0.1.2
+ Requires-Dist: pyro-ppl==1.9.1
+ Requires-Dist: fsspec==2025.10.0
+ Requires-Dist: filelock==3.19.1
+ Requires-Dist: diskcache==5.6.3
+ Requires-Dist: platformdirs==4.4.0
+ Requires-Dist: distro==1.9.0
+ Requires-Dist: pydantic==2.12.4
+ Requires-Dist: pydantic-core==2.41.5
+ Requires-Dist: typing-extensions==4.15.0
+ Requires-Dist: typing-inspection==0.4.2
+ Requires-Dist: annotated-types==0.7.0
+ Requires-Dist: pytest==8.4.2
+ Requires-Dist: pytest-cov==7.0.0
+ Requires-Dist: coverage==7.10.7
+ Requires-Dist: joblib==1.5.2
+ Requires-Dist: threadpoolctl==3.6.0
+ Requires-Dist: networkx==3.2.1
+ Requires-Dist: einops==0.8.1
+ Requires-Dist: triton==3.4.0
+ Requires-Dist: safetensors==0.7.0
+ Requires-Dist: ml-dtypes==0.5.3
+ Requires-Dist: tenacity==9.1.2
+ Requires-Dist: xxhash==3.6.0
+ Requires-Dist: xlsxwriter==3.2.9
+ Requires-Dist: aislib==0.1.14a0
+ Requires-Dist: swanlab==0.7.6
+ Provides-Extra: cuda
+ Requires-Dist: nvidia-cublas-cu12==12.8.4.1; extra == "cuda"
+ Requires-Dist: nvidia-cuda-cupti-cu12==12.8.90; extra == "cuda"
+ Requires-Dist: nvidia-cuda-nvrtc-cu12==12.8.93; extra == "cuda"
+ Requires-Dist: nvidia-cuda-runtime-cu12==12.8.90; extra == "cuda"
+ Requires-Dist: nvidia-cudnn-cu12==9.10.2.21; extra == "cuda"
+ Requires-Dist: nvidia-cufft-cu12==11.3.3.83; extra == "cuda"
+ Requires-Dist: nvidia-cufile-cu12==1.13.1.3; extra == "cuda"
+ Requires-Dist: nvidia-curand-cu12==10.3.9.90; extra == "cuda"
+ Requires-Dist: nvidia-cusolver-cu12==11.7.3.90; extra == "cuda"
+ Requires-Dist: nvidia-cusparse-cu12==12.5.8.93; extra == "cuda"
+ Requires-Dist: nvidia-cusparselt-cu12==0.7.1; extra == "cuda"
+ Requires-Dist: nvidia-ml-py==13.580.82; extra == "cuda"
+ Requires-Dist: nvidia-nccl-cu12==2.27.3; extra == "cuda"
+ Requires-Dist: nvidia-nvjitlink-cu12==12.8.93; extra == "cuda"
+ Requires-Dist: nvidia-nvtx-cu12==12.8.90; extra == "cuda"
+ Provides-Extra: extra
+ Requires-Dist: autogen-agentchat==0.2.40; extra == "extra"
+ Requires-Dist: swanlab==0.7.6; extra == "extra"
+ Requires-Dist: docker==7.1.0; extra == "extra"
+ Requires-Dist: boto3==1.40.69; extra == "extra"
+ Requires-Dist: botocore==1.40.69; extra == "extra"
+ Requires-Dist: s3transfer==0.14.0; extra == "extra"
+ Provides-Extra: dev
+ Requires-Dist: black>=22.0.0; extra == "dev"
+ Requires-Dist: flake8>=4.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+
+ # GPBench
+
+ GPBench is a benchmarking toolkit for genomic prediction. It reimplements and integrates many commonly used methods, from classic linear statistical approaches to machine learning and deep learning methods: rrBLUP, GBLUP, BayesA/B/C, SVR, Random Forest, XGBoost, LightGBM, DeepGS, DL_GWAS, G2PDeep, MVP, DNNGP, SoyDNGP, DeepCCR, EIR, Cropformer, GEFormer, CropARNet, etc.
+
+ Project Website: [https://www.sdu-idea.cn/GPBench/](https://www.sdu-idea.cn/GPBench/)
+
+ ![GPBench overview](data/fig/fig1.png)
+
+ ## Key Features
+ - Implements multiple genomic prediction methods and reproducible experimental workflows
+ - Supports GPU-accelerated deep learning methods (using PyTorch)
+ - Unified data loading and 10-fold cross-validation pipeline
+ - Outputs standardized evaluation metrics (PCC, MAE, MSE, R2) and per-fold predictions
+ - **LLM-powered analysis tool** (`gp_agent_tool`): analyzes dataset characteristics, finds similar datasets, and recommends suitable genomic prediction methods based on historical experimental experience
+
+ ## Important Structure
+ - `data/`: example/real dataset directory; each species/dataset is a subfolder (e.g., `data/Cotton/`) containing:
+   - `genotype.npz`: genotype matrix (typically saved as a NumPy array)
+   - `phenotype.npz`: phenotype data (contains the phenotype matrix and phenotype names)
+ - `method_reg/`: one subdirectory per method (each usually contains a main runner script plus hyperparameter/utility scripts)
+ - `result/`: default output directory for experimental results
+ - `gp_agent_tool/`: LLM-powered dataset analysis and method recommendation tool (see the [Dataset Analysis Tool](#dataset-analysis-tool-gp_agent_tool) section)
+ - `environment.yml`: dependency file for creating a conda environment (recommended)
+
+ ## Environment Setup (recommended: conda)
+ The repository ships an `environment.yml`; it is recommended to create and activate a conda environment with it:
+
+ ```bash
+ # On a machine with conda:
+ conda env create -f environment.yml
+ conda activate Benchmark
+ ```
+
+ Notes:
+ - `environment.yml` contains most dependencies (including CUDA/cuDNN-related packages and a pip section) and is suitable for GPU-enabled environments (the file references CUDA 11.8 and matching RAPIDS/torch/cupy versions).
+ - Ensure the target machine has an NVIDIA driver compatible with CUDA 11.8/12.
+ - If you cannot use the environment file directly, you can install the main dependencies into an existing Python environment as needed:
+
+ ```bash
+ pip install -U numpy pandas scikit-learn torch torchvision optuna psutil xgboost lightgbm
+ ```
+
+ (Warning: this is a simplified installation; some packages may need additional configuration on GPU systems or certain platforms.)
+
+ ## Data Format and Preparation
+ - Each species folder should contain `genotype.npz` and `phenotype.npz`.
+ - `genotype.npz` usually stores a 2D array (number of samples × number of SNPs).
+ - `phenotype.npz` typically includes two arrays: the phenotype matrix (number of samples × number of phenotypes) and a list of phenotype names.
+
+ Quickly view the phenotype names for a dataset (e.g., `Cotton`):
+
+ ```bash
+ python - <<'PY'
+ import numpy as np
+ obj = np.load('data/Cotton/phenotype.npz')
+ print(obj['arr_1'])
+ PY
+ ```
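For orientation, here is a minimal loading sketch in plain Python. It assumes the `arr_0`/`arr_1` key layout used in the snippet above; key names can vary by dataset:

```python
import numpy as np

# Load one dataset folder (paths follow the layout described above).
geno = np.load('data/Cotton/genotype.npz')['arr_0']   # samples x SNPs
pheno_file = np.load('data/Cotton/phenotype.npz')
pheno = pheno_file['arr_0']                           # samples x phenotypes
names = pheno_file['arr_1']                           # phenotype names

print(geno.shape, pheno.shape, list(names))
# Genotype and phenotype rows are expected to be aligned by sample:
assert geno.shape[0] == pheno.shape[0]
```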
+
+ ## Quick Start (example with a method)
+ Most methods have a main script under `method_reg/<Method>/`. Scripts usually accept parameters such as `--methods`, `--species`, `--phe`, `--data_dir`, `--result_dir`, etc. Example:
+
+ ```bash
+ # 1) Activate the environment
+ conda activate Benchmark
+
+ # 2) Run a single phenotype with DeepCCR (note: include the trailing slash after --species)
+ python method_reg/DeepCCR/DeepCCR.py \
+   --methods DeepCCR/ \
+   --species Cotton/ \
+   --phe FibLen_17_18 \
+   --data_dir data/ \
+   --result_dir result/
+ ```
+
+ Common optional arguments (may vary across scripts):
+ - `--epoch`: number of training epochs (example scripts often default to 1000)
+ - `--batch_size`: batch size
+ - `--lr`: learning rate
+ - `--patience`: early stopping patience
+
+ You can inspect the argparse help for a specific script in its method directory:
+
+ ```bash
+ python method_reg/DeepCCR/DeepCCR.py -h
+ ```
+
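Since every runner shares the same CLI shape, looping over phenotypes is straightforward. A hedged sketch, where the phenotype list and paths are illustrative:

```python
import subprocess

# Hypothetical batch driver: run one method across several phenotypes.
phenotypes = ["FibLen_17_18"]  # extend with names read from phenotype.npz ('arr_1')
for phe in phenotypes:
    subprocess.run(
        ["python", "method_reg/DeepCCR/DeepCCR.py",
         "--methods", "DeepCCR/", "--species", "Cotton/",
         "--phe", phe, "--data_dir", "data/", "--result_dir", "result/"],
        check=True,  # stop on the first failing run
    )
```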
+ ## Dataset Analysis Tool (gp_agent_tool)
+
+ `gp_agent_tool` is an LLM-powered analysis tool that performs comprehensive dataset analysis and automatically recommends suitable genomic prediction methods. It analyzes your dataset's characteristics, computes statistical features, finds similar datasets from historical experiments, and provides evidence-based method recommendations.
+
+ ### Features
+ - **Dataset statistical analysis**: automatically computes and analyzes dataset statistics including sample size, marker count, phenotype distribution, missing rates, and statistical properties
+ - **Similar dataset discovery**: finds datasets with statistical distributions similar to your query dataset in historical experimental databases
+ - **Method recommendation**: recommends genomic prediction methods that have shown the best performance on similar datasets, based on historical experience
+ - **Bilingual support**: supports both Chinese and English queries and analysis
+ - **Experience-based insights**: leverages comprehensive historical experimental results to provide evidence-based analysis and recommendations
+
+ ### Prerequisites
+
+ 1. **LLM Configuration**: Create a configuration file at `gp_agent_tool/config/config.json` with your LLM API settings (a quick sanity-check sketch follows this list):
+
+ ```json
+ {
+   "llm": {
+     "model": "gpt-4o-mini",
+     "api_key": "YOUR_OPENAI_API_KEY",
+     "base_url": "https://api.openai.com/v1",
+     "timeout_seconds": 60,
+     "max_retries": 3
+   },
+   "codegen_llm": {
+     "model": "gpt-4o-mini",
+     "api_key": "YOUR_OPENAI_API_KEY",
+     "base_url": "https://api.openai.com/v1",
+     "timeout_seconds": 60,
+     "max_retries": 3
+   },
+   "multimodal_llm": {
+     "model": "qwen-vl-max",
+     "api_key": "YOUR_DASHSCOPE_API_KEY"
+   }
+ }
+ ```
+
+ **Important**: replace the `api_key` fields in the configuration file with your own API keys:
+ - Replace `YOUR_OPENAI_API_KEY` in `llm` and `codegen_llm` with your OpenAI API key
+ - Replace `YOUR_DASHSCOPE_API_KEY` in `multimodal_llm` with your Alibaba Cloud DashScope API key
+
+ You can obtain API keys from the following URLs:
+ - OpenAI API key: https://platform.openai.com/api-keys
+ - Alibaba Cloud DashScope API key: https://dashscope.console.aliyun.com/apiKey
+
+ 2. **Additional Dependencies**: Install the required packages for the tool:
+
+ ```bash
+ pip install langchain langgraph openai
+ ```
+
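The sanity-check sketch referenced above; it only validates the documented config schema and is not part of the package:

```python
import json
from pathlib import Path

cfg = json.loads(Path("gp_agent_tool/config/config.json").read_text())

# The documented config has three sections; warn about leftover placeholders.
for section in ("llm", "codegen_llm", "multimodal_llm"):
    key = cfg.get(section, {}).get("api_key", "")
    if not key or key.startswith("YOUR_"):
        print(f"{section}: api_key still looks like a placeholder")
    else:
        print(f"{section}: model={cfg[section].get('model')}")
```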
275
+ ### Usage
276
+
277
+ #### Basic Usage
278
+
279
+ Run the tool from the project root directory:
280
+
281
+ ```bash
282
+ cd gp_agent_tool
283
+ python main.py \
284
+ -q "Based on existing models, summarize the patterns in the mkg trait of cattle." \
285
+ -o result.json
286
+ ```
287
+
288
+ Or in English:
289
+
290
+ ```bash
291
+ python main.py \
292
+ -d ../data/Rapeseed \
293
+ -q "Recommend the best methods for this dataset" \
294
+ -o result.json
295
+ ```
296
+
297
+ #### Command-line Arguments
298
+
299
+ - **`-d / --dataset`** (optional): Path to the dataset directory containing `genotype.npz` and `phenotype.npz`. The tool will analyze this dataset to compute statistical features. If not provided, analysis and recommendations are based on the complete experience table only.
300
+ - **`-q / --user-query`** (required): Your analysis requirement or question description (supports both Chinese and English). Examples: "分析这个数据集的特征" / "Analyze this dataset and recommend methods" / "What methods work best for binary phenotypes?"
301
+ - **`-m / --mask`** (optional): Specify a `species/phenotype` (e.g., `Rapeseed/FloweringTime`) to mask in the reference experience database, preventing "answer leakage" when evaluating on known datasets.
302
+ - **`-o / --output`** (optional): Path to save the analysis result as a JSON file. If not provided, results are printed to the terminal.
303
+
+ #### Dataset Analysis Features
+
+ When a dataset path is provided, the tool automatically computes the following statistical features:
+
+ - **Sample information**: total samples, valid samples, missing rate
+ - **Marker information**: number of markers, genotype statistics (mean, std, missing rate, MAF)
+ - **Phenotype statistics**: mean, std, min, max, median, skewness, kurtosis
+ - **Data type information**: genotype and phenotype data types, binary phenotype detection
+
+ #### Example Output
+
+ The tool returns a JSON object with two main sections:
+
+ ```json
+ {
+   "similar_datasets": {
+     "items": ["Chickpea/Days_to_0.5_flowering", "Cotton/FibLen_17_18"],
+     "reason": "These datasets have similar statistical distributions..."
+   },
+   "methods": {
+     "items": ["GBLUP", "XGBoost", "LightGBM"],
+     "reason": "Based on historical experience, these methods showed best performance on similar datasets..."
+   }
+ }
+ ```
+
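A minimal sketch for consuming the saved result file, assuming the two-section schema shown above:

```python
import json

with open("result.json") as f:  # the file written via -o result.json
    result = json.load(f)

print("Similar datasets:", ", ".join(result["similar_datasets"]["items"]))
print("Recommended methods:", ", ".join(result["methods"]["items"]))
print("Reasoning:", result["methods"]["reason"])
```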
330
+ #### Analysis Workflow
331
+
332
+ When you provide a dataset path, the tool performs the following analysis steps:
333
+
334
+ 1. **Dataset feature extraction**: Computes statistical features from your dataset (phenotype mean, std, skewness, kurtosis, sample size, marker count, etc.)
335
+ 2. **Similar dataset matching**: Compares your dataset features with historical datasets to find the most similar ones
336
+ 3. **Experience table filtering**: Filters the historical experience table to include only results from similar datasets
337
+ 4. **Method analysis and recommendation**: Analyzes which methods performed best on similar datasets and recommends them with detailed reasoning
338
+
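To make step 2 concrete, here is an illustrative matching scheme (not the tool's actual algorithm): z-score the shared numeric features, then rank historical datasets by Euclidean distance. The column names are assumptions based on the feature list above:

```python
import numpy as np
import pandas as pd

def top_k_similar(query_features: dict, history: pd.DataFrame, k: int = 5):
    """Rank historical datasets (one row per species/phenotype, indexed by
    name) by z-scored Euclidean distance to the query's shared features."""
    feats = [c for c in history.columns if c in query_features]
    X = history[feats].to_numpy(dtype=float)
    q = np.array([query_features[c] for c in feats], dtype=float)
    mu, sd = X.mean(axis=0), X.std(axis=0) + 1e-12  # avoid divide-by-zero
    dist = np.linalg.norm((X - mu) / sd - (q - mu) / sd, axis=1)
    return history.index[np.argsort(dist)[:k]].tolist()
```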
339
+ #### Use Cases
340
+
341
+ 1. **General method query**: Query methods based on specific criteria without providing a dataset:
342
+
343
+ ```bash
344
+ python main.py \
345
+ -q "What methods work best for small sample sizes?" \
346
+ -o result.json
347
+ ```
348
+
349
+ 2. **Evaluation mode with masking**: When evaluating on a known dataset, mask it to avoid bias in the analysis:
350
+
351
+ ```bash
352
+ python main.py \
353
+ -d ../data/Rapeseed \
354
+ -q "Analyze this dataset and recommend appropriate algorithms." \
355
+ -m Rapeseed/FloweringTime \
356
+ -o result.json
357
+ ```
358
+
+ ## Output Description
+ - Each method run creates a directory under `result/` named by method/species/phenotype, e.g., `result/DeepCCR/Cotton/<PHENO>/`.
+ - Per-fold prediction results are typically saved as `fold{n}.csv`, containing `Y_test` and `Y_pred` columns.
+ - The script prints or saves average evaluation metrics at the end: PCC (Pearson correlation coefficient), MAE, MSE, R2, along with runtime and memory/GPU usage (a recomputation sketch follows this list).
+
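The recomputation sketch referenced above: given one saved fold file, recover the documented metrics. The fold path is an example, and the fold numbering may differ:

```python
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Recompute metrics from one saved fold (column names as documented above).
df = pd.read_csv("result/DeepCCR/Cotton/FibLen_17_18/fold0.csv")
y_true, y_pred = df["Y_test"], df["Y_pred"]

pcc, _ = pearsonr(y_true, y_pred)
print(f"PCC={pcc:.4f} MAE={mean_absolute_error(y_true, y_pred):.4f} "
      f"MSE={mean_squared_error(y_true, y_pred):.4f} R2={r2_score(y_true, y_pred):.4f}")
```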
+ ## Full Dataset Link
+ - [Species dataset](https://doi.org/10.6084/m9.figshare.31007608): contains genotype and phenotype data for 16 species.
+
+ ## Running Tips & Troubleshooting
+ - For GPU usage, activate the environment (`conda activate Benchmark`) and make sure CUDA drivers are available; `torch.cuda.is_available()` should return True.
+ - If you hit memory or GPU OOM issues, try reducing `--batch_size` or disabling some parallel settings in the scripts.
+ - On CPU-only systems, some GPU-specific methods (RAPIDS or GPU-only implementations) may be unavailable or require alternative implementations.
+
+ ## Contributing & Contact
+ - Contributions via issues and PRs are welcome. Please describe changes and testing in PRs.
+ - Contact: open an Issue in the repository or reach the repository owner (GitHub user: `xwzhang2118`).
+
gpbench-1.0.0/README.md ADDED
@@ -0,0 +1,244 @@
(The README.md content is identical to the README body embedded in PKG-INFO above.)
gpbench-1.0.0/gp_agent_tool/compute_dataset_feature.py ADDED
@@ -0,0 +1,67 @@
+ import numpy as np
+ from scipy.stats import skew, kurtosis
+ import os
+
+ def process_one_phenotype(dataset_path: str) -> dict:
+     """
+     Process a single phenotype and return a summary dict.
+     """
+     geno_path = os.path.join(dataset_path, "genotype.npz")
+     pheno_path = os.path.join(dataset_path, "phenotype.npz")
+
+     genotype = np.load(geno_path)['arr_0']
+     pheno_file = np.load(pheno_path)
+     phenotype = pheno_file['arr_0']
+     phe_name = pheno_file['arr_1']
+     sp_name = pheno_file['arr_2']
+     phe_data = phenotype[:, 0]
+
+     # Remove samples with missing phenotype values
+     mask = ~np.isnan(phe_data)
+     phe_clean = phe_data[mask]
+     geno_clean = genotype[mask] if mask.sum() > 0 else genotype
+
+     summary = {
+         # Basic information
+         # 'species_phenotype': f"{sp_name}/{phe_name}",
+         'species': sp_name,
+         # 'phenotype_name': phe_name,
+
+         # Dimension information
+         'n_samples_total': genotype.shape[0],
+         'n_samples_valid': len(phe_clean),
+         'n_markers': genotype.shape[1] if genotype.ndim > 1 else 1,
+         'missing_rate': 1 - len(phe_clean) / genotype.shape[0],
+
+         # Phenotype statistics
+         'pheno_mean': np.mean(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_std': np.std(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_min': np.min(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_max': np.max(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_median': np.median(phe_clean) if len(phe_clean) > 0 else np.nan,
+         'pheno_skewness': skew(phe_clean) if len(phe_clean) > 3 else np.nan,
+         'pheno_kurtosis': kurtosis(phe_clean) if len(phe_clean) > 3 else np.nan,
+
+         # Genotype statistics
+         'geno_mean': np.mean(geno_clean) if geno_clean.size > 0 else np.nan,
+         'geno_std': np.std(geno_clean) if geno_clean.size > 0 else np.nan,
+         'geno_missing_rate': (
+             np.isnan(geno_clean).sum() / geno_clean.size
+             if geno_clean.size > 0 else np.nan
+         ),
+         'geno_maf': (
+             np.mean(
+                 np.minimum(
+                     np.mean(geno_clean, axis=0),
+                     1 - np.mean(geno_clean, axis=0)
+                 )
+             ) if geno_clean.ndim > 1 and geno_clean.size > 0 else np.nan
+         ),
+
+         # Type information
+         'geno_dtype': str(genotype.dtype),
+         'pheno_dtype': str(phe_data.dtype),
+         'is_pheno_binary': len(np.unique(phe_clean)) == 2 if len(phe_clean) > 0 else False
+     }
+
+     return summary
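A hedged usage sketch for this helper. The import path and dataset folder are illustrative; the folder must contain `genotype.npz` and `phenotype.npz` with the `arr_0`/`arr_1`/`arr_2` keys the function reads:

```python
from pprint import pprint

# Assumed import path, matching the file's location in this package.
from gp_agent_tool.compute_dataset_feature import process_one_phenotype

summary = process_one_phenotype("data/Cotton")
pprint(summary)  # these features feed gp_agent_tool's dataset matching step
```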