biopytools 0.21.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopytools-0.21.1/PKG-INFO +27 -0
- biopytools-0.21.1/README.md +114 -0
- biopytools-0.21.1/biopytools/admixture/__init__.py +35 -0
- biopytools-0.21.1/biopytools/admixture/analysis.py +200 -0
- biopytools-0.21.1/biopytools/admixture/config.py +82 -0
- biopytools-0.21.1/biopytools/admixture/data_processing.py +482 -0
- biopytools-0.21.1/biopytools/admixture/main.py +255 -0
- biopytools-0.21.1/biopytools/admixture/results.py +234 -0
- biopytools-0.21.1/biopytools/admixture/utils.py +212 -0
- biopytools-0.21.1/biopytools/agp2table/__init__.py +27 -0
- biopytools-0.21.1/biopytools/agp2table/config.py +52 -0
- biopytools-0.21.1/biopytools/agp2table/main.py +532 -0
- biopytools-0.21.1/biopytools/agp2table/utils.py +53 -0
- biopytools-0.21.1/biopytools/annovar/__init__.py +40 -0
- biopytools-0.21.1/biopytools/annovar/annotation.py +63 -0
- biopytools-0.21.1/biopytools/annovar/config.py +82 -0
- biopytools-0.21.1/biopytools/annovar/data_processing.py +147 -0
- biopytools-0.21.1/biopytools/annovar/main.py +265 -0
- biopytools-0.21.1/biopytools/annovar/results.py +61 -0
- biopytools-0.21.1/biopytools/annovar/results_processor.py +465 -0
- biopytools-0.21.1/biopytools/annovar/utils.py +247 -0
- biopytools-0.21.1/biopytools/bam_cov/__init__.py +31 -0
- biopytools-0.21.1/biopytools/bam_cov/config.py +108 -0
- biopytools-0.21.1/biopytools/bam_cov/main.py +286 -0
- biopytools-0.21.1/biopytools/bam_cov/utils.py +351 -0
- biopytools-0.21.1/biopytools/bam_stats/__init__.py +8 -0
- biopytools-0.21.1/biopytools/bam_stats/alignment_stats.py +102 -0
- biopytools-0.21.1/biopytools/bam_stats/batch_processor.py +215 -0
- biopytools-0.21.1/biopytools/bam_stats/config.py +150 -0
- biopytools-0.21.1/biopytools/bam_stats/core.py +149 -0
- biopytools-0.21.1/biopytools/bam_stats/coverage_stats.py +161 -0
- biopytools-0.21.1/biopytools/bam_stats/duplicate_stats.py +91 -0
- biopytools-0.21.1/biopytools/bam_stats/genome_stats.py +207 -0
- biopytools-0.21.1/biopytools/bam_stats/insert_stats.py +71 -0
- biopytools-0.21.1/biopytools/bam_stats/main.py +81 -0
- biopytools-0.21.1/biopytools/bam_stats/sample_stats.py +82 -0
- biopytools-0.21.1/biopytools/bam_stats/sequence_stats.py +137 -0
- biopytools-0.21.1/biopytools/bam_stats/utils.py +139 -0
- biopytools-0.21.1/biopytools/blast/__init__.py +29 -0
- biopytools-0.21.1/biopytools/blast/alignment_visualizer.py +201 -0
- biopytools-0.21.1/biopytools/blast/config.py +403 -0
- biopytools-0.21.1/biopytools/blast/html_alignment.py +276 -0
- biopytools-0.21.1/biopytools/blast/html_templates.py +443 -0
- biopytools-0.21.1/biopytools/blast/main.py +1090 -0
- biopytools-0.21.1/biopytools/blast/statistics.py +257 -0
- biopytools-0.21.1/biopytools/blast/text_alignment.py +189 -0
- biopytools-0.21.1/biopytools/blast/utils.py +218 -0
- biopytools-0.21.1/biopytools/busco/__init__.py +31 -0
- biopytools-0.21.1/biopytools/busco/analyzer.py +187 -0
- biopytools-0.21.1/biopytools/busco/config.py +103 -0
- biopytools-0.21.1/biopytools/busco/main.py +234 -0
- biopytools-0.21.1/biopytools/busco/results.py +227 -0
- biopytools-0.21.1/biopytools/busco/utils.py +224 -0
- biopytools-0.21.1/biopytools/bwa/__init__.py +11 -0
- biopytools-0.21.1/biopytools/bwa/alignment.py +137 -0
- biopytools-0.21.1/biopytools/bwa/config.py +171 -0
- biopytools-0.21.1/biopytools/bwa/coverage.py +104 -0
- biopytools-0.21.1/biopytools/bwa/genome.py +53 -0
- biopytools-0.21.1/biopytools/bwa/main.py +275 -0
- biopytools-0.21.1/biopytools/bwa/stats.py +90 -0
- biopytools-0.21.1/biopytools/bwa/utils.py +145 -0
- biopytools-0.21.1/biopytools/cli/commands/admixture.py +191 -0
- biopytools-0.21.1/biopytools/cli/commands/agp2table.py +104 -0
- biopytools-0.21.1/biopytools/cli/commands/annovar.py +150 -0
- biopytools-0.21.1/biopytools/cli/commands/bam_cov.py +184 -0
- biopytools-0.21.1/biopytools/cli/commands/bam_stats.py +108 -0
- biopytools-0.21.1/biopytools/cli/commands/blast.py +342 -0
- biopytools-0.21.1/biopytools/cli/commands/busco.py +281 -0
- biopytools-0.21.1/biopytools/cli/commands/bwa.py +311 -0
- biopytools-0.21.1/biopytools/cli/commands/deepbsa.py +138 -0
- biopytools-0.21.1/biopytools/cli/commands/dual_rnaseq.py +148 -0
- biopytools-0.21.1/biopytools/cli/commands/fastp.py +197 -0
- biopytools-0.21.1/biopytools/cli/commands/genome_analysis.py +174 -0
- biopytools-0.21.1/biopytools/cli/commands/hifi_hic.py +222 -0
- biopytools-0.21.1/biopytools/cli/commands/iseq.py +173 -0
- biopytools-0.21.1/biopytools/cli/commands/rnaseq.py +161 -0
- biopytools-0.21.1/biopytools/cli/commands/sra2fastq.py +130 -0
- biopytools-0.21.1/biopytools/cli/commands/vcf2pca.py +188 -0
- biopytools-0.21.1/biopytools/cli/commands/vcf2phylip.py +145 -0
- biopytools-0.21.1/biopytools/cli/main.py +244 -0
- biopytools-0.21.1/biopytools/deepbsa/__init__.py +10 -0
- biopytools-0.21.1/biopytools/deepbsa/config.py +97 -0
- biopytools-0.21.1/biopytools/deepbsa/main.py +161 -0
- biopytools-0.21.1/biopytools/deepbsa/merge_results.py +345 -0
- biopytools-0.21.1/biopytools/deepbsa/plot_data_calculator.py +362 -0
- biopytools-0.21.1/biopytools/deepbsa/runner.py +278 -0
- biopytools-0.21.1/biopytools/deepbsa/utils.py +250 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/__init__.py +38 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/bam_to_fastq.py +309 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/classification.py +260 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/config.py +93 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/data_processing.py +226 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/indexing.py +153 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/main.py +283 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/mapping_stats.py +286 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/quantification.py +163 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/results.py +148 -0
- biopytools-0.21.1/biopytools/dual_rnaseq/utils.py +132 -0
- biopytools-0.21.1/biopytools/fastp/__init__.py +35 -0
- biopytools-0.21.1/biopytools/fastp/config.py +145 -0
- biopytools-0.21.1/biopytools/fastp/data_processing.py +247 -0
- biopytools-0.21.1/biopytools/fastp/main.py +236 -0
- biopytools-0.21.1/biopytools/fastp/processing.py +155 -0
- biopytools-0.21.1/biopytools/fastp/results.py +82 -0
- biopytools-0.21.1/biopytools/fastp/utils.py +148 -0
- biopytools-0.21.1/biopytools/genome_analysis/__init__.py +8 -0
- biopytools-0.21.1/biopytools/genome_analysis/config.py +99 -0
- biopytools-0.21.1/biopytools/genome_analysis/genomescope.py +581 -0
- biopytools-0.21.1/biopytools/genome_analysis/main.py +618 -0
- biopytools-0.21.1/biopytools/genome_analysis/utils.py +950 -0
- biopytools-0.21.1/biopytools/hifi_hic/__init__.py +40 -0
- biopytools-0.21.1/biopytools/hifi_hic/assembler.py +237 -0
- biopytools-0.21.1/biopytools/hifi_hic/config.py +261 -0
- biopytools-0.21.1/biopytools/hifi_hic/logger.py +64 -0
- biopytools-0.21.1/biopytools/hifi_hic/main.py +383 -0
- biopytools-0.21.1/biopytools/hifi_hic/ngs_polisher.py +397 -0
- biopytools-0.21.1/biopytools/hifi_hic/purge_dups_wrapper.py +148 -0
- biopytools-0.21.1/biopytools/hifi_hic/report.py +234 -0
- biopytools-0.21.1/biopytools/hifi_hic/utils.py +200 -0
- biopytools-0.21.1/biopytools/iseq/__init__.py +28 -0
- biopytools-0.21.1/biopytools/iseq/calculator.py +192 -0
- biopytools-0.21.1/biopytools/iseq/config.py +108 -0
- biopytools-0.21.1/biopytools/iseq/main.py +158 -0
- biopytools-0.21.1/biopytools/iseq/utils.py +92 -0
- biopytools-0.21.1/biopytools/rnaseq/__init__.py +30 -0
- biopytools-0.21.1/biopytools/rnaseq/alignment.py +302 -0
- biopytools-0.21.1/biopytools/rnaseq/config.py +149 -0
- biopytools-0.21.1/biopytools/rnaseq/data_processing.py +403 -0
- biopytools-0.21.1/biopytools/rnaseq/main.py +446 -0
- biopytools-0.21.1/biopytools/rnaseq/quantification.py +190 -0
- biopytools-0.21.1/biopytools/rnaseq/results.py +267 -0
- biopytools-0.21.1/biopytools/rnaseq/utils.py +214 -0
- biopytools-0.21.1/biopytools/sra2fastq/__init__.py +29 -0
- biopytools-0.21.1/biopytools/sra2fastq/config.py +123 -0
- biopytools-0.21.1/biopytools/sra2fastq/main.py +149 -0
- biopytools-0.21.1/biopytools/sra2fastq/processor.py +141 -0
- biopytools-0.21.1/biopytools/sra2fastq/report.py +75 -0
- biopytools-0.21.1/biopytools/sra2fastq/utils.py +131 -0
- biopytools-0.21.1/biopytools/vcf2phylip/__init__.py +35 -0
- biopytools-0.21.1/biopytools/vcf2phylip/config.py +68 -0
- biopytools-0.21.1/biopytools/vcf2phylip/main.py +162 -0
- biopytools-0.21.1/biopytools/vcf2phylip/matrix_writer.py +146 -0
- biopytools-0.21.1/biopytools/vcf2phylip/processor.py +134 -0
- biopytools-0.21.1/biopytools/vcf2phylip/utils.py +75 -0
- biopytools-0.21.1/biopytools/vcf2phylip/vcf_parser.py +76 -0
- biopytools-0.21.1/biopytools/vcf_pca/__init__.py +30 -0
- biopytools-0.21.1/biopytools/vcf_pca/config.py +76 -0
- biopytools-0.21.1/biopytools/vcf_pca/data_processing.py +194 -0
- biopytools-0.21.1/biopytools/vcf_pca/main.py +186 -0
- biopytools-0.21.1/biopytools/vcf_pca/pca_analysis.py +174 -0
- biopytools-0.21.1/biopytools/vcf_pca/results.py +60 -0
- biopytools-0.21.1/biopytools/vcf_pca/utils.py +104 -0
- biopytools-0.21.1/biopytools/vcf_pca/visualization.py +171 -0
- biopytools-0.21.1/biopytools.egg-info/PKG-INFO +27 -0
- biopytools-0.21.1/biopytools.egg-info/SOURCES.txt +159 -0
- biopytools-0.21.1/biopytools.egg-info/dependency_links.txt +1 -0
- biopytools-0.21.1/biopytools.egg-info/entry_points.txt +2 -0
- biopytools-0.21.1/biopytools.egg-info/requires.txt +19 -0
- biopytools-0.21.1/biopytools.egg-info/top_level.txt +1 -0
- biopytools-0.21.1/pyproject.toml +46 -0
- biopytools-0.21.1/setup.cfg +4 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: biopytools
|
|
3
|
+
Version: 0.21.1
|
|
4
|
+
Summary: Bioinformatics tools package with Python.
|
|
5
|
+
Author-email: Xiang LI <lixiang117423@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/lixiang117423/biopytools
|
|
7
|
+
Project-URL: Documentation, https://lixiang117423.github.io/article/biopytools-readme/
|
|
8
|
+
Project-URL: Repository, https://github.com/lixiang117423/biopytools.git
|
|
9
|
+
Project-URL: Issues, https://github.com/lixiang117423/biopytools/issues
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: pyfastx>=0.8.4
|
|
14
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
15
|
+
Requires-Dist: matplotlib>=3.5.0
|
|
16
|
+
Requires-Dist: seaborn>=0.11.0
|
|
17
|
+
Requires-Dist: click>=8.0.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
21
|
+
Requires-Dist: black; extra == "dev"
|
|
22
|
+
Requires-Dist: isort; extra == "dev"
|
|
23
|
+
Requires-Dist: flake8; extra == "dev"
|
|
24
|
+
Provides-Extra: docs
|
|
25
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
26
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
27
|
+
Requires-Dist: mkdocs-mermaid2-plugin; extra == "docs"
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# BioPyTools
|
|
2
|
+
|
|
3
|
+
A Python toolkit for bioinformatics analysis and computational biology.
|
|
4
|
+
|
|
5
|
+
一个用于生物信息学分析和计算生物学的Python工具包。
|
|
6
|
+
|
|
7
|
+
## 简介 | Overview
|
|
8
|
+
|
|
9
|
+
BioPyTools 是一个专为生物信息学研究设计的Python工具包,提供了一系列常用的生物数据分析功能。
|
|
10
|
+
|
|
11
|
+
BioPyTools is a Python toolkit designed for bioinformatics research, providing a series of commonly used biological data analysis functions.
|
|
12
|
+
|
|
13
|
+
## 系统要求 | Requirements
|
|
14
|
+
|
|
15
|
+
- Python >= 3.10
|
|
16
|
+
- NumPy >= 1.19.0
|
|
17
|
+
- Pandas >= 1.2.0
|
|
18
|
+
- Matplotlib >= 3.3.0
|
|
19
|
+
|
|
20
|
+
## 环境配置 / Environment Setup
|
|
21
|
+
|
|
22
|
+
相关的Conda 环境配置文件位于 [`conda_env/`](conda_env/) 目录下。
|
|
23
|
+
|
|
24
|
+
Conda environment files can be found in the [`conda_env/`](conda_env/) directory.
|
|
25
|
+
|
|
26
|
+
## 安装方法 | Installation
|
|
27
|
+
|
|
28
|
+
### 从源码安装 | Install from source
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/lixiang117423/biopytools.git
|
|
32
|
+
cd biopytools
|
|
33
|
+
pip install -e .
|
|
34
|
+
|
|
35
|
+
# or
|
|
36
|
+
pip install .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## 使用方法 | Usage
|
|
40
|
+
|
|
41
|
+
### 查看帮助 | Getting Help
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
biopytools -h
|
|
45
|
+
Usage: biopytools [OPTIONS] COMMAND [ARGS]...
|
|
46
|
+
|
|
47
|
+
BioPyTools - 生物信息学分析工具包
|
|
48
|
+
|
|
49
|
+
要查看特定命令的帮助,请运行:biopytools <命令> -h/--help, 如biopytools annovar -h
|
|
50
|
+
|
|
51
|
+
Options:
|
|
52
|
+
-v, --version Show the version and exit.
|
|
53
|
+
-h, --help Show this message and exit.
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## 模块文档 | README
|
|
57
|
+
|
|
58
|
+
[admixture](./docs/admixture.md) - [群体结构分析软件Admixture](https://genome.cshlp.org/content/19/9/1655)
|
|
59
|
+
|
|
60
|
+
[agp2table](./docs/agp2table.md) - agp文件转换为table
|
|
61
|
+
|
|
62
|
+
[annovar](./docs/annovar.md) - [ANNOVAR变异功能注释工具](https://academic.oup.com/nar/article/38/16/e164/1749458)
|
|
63
|
+
|
|
64
|
+
[bam-cov](./docs/bam_coverage_stats.md) - BAM文件覆盖度统计
|
|
65
|
+
|
|
66
|
+
[blast](./docs/blast_v2.md) - [序列比对工具](https://academic.oup.com/nar/article/36/suppl_2/W5/2505810?login=true)
|
|
67
|
+
|
|
68
|
+
[busco](./docs/busco.md) - [BUSCO](https://academic.oup.com/bioinformatics/article/31/19/3210/211866)
|
|
69
|
+
|
|
70
|
+
[bwa](docs/bam_coverage_stats.md) - [BWA比对](https://academic.oup.com/bioinformatics/article/25/14/1754/225615)
|
|
71
|
+
|
|
72
|
+
[deepbsa](./docs/deepbsa.md) - [DeepBSA](https://github.com/lizhao007/DeepBSA/)
|
|
73
|
+
|
|
74
|
+
[dual-rnaseq](docs/dual_rnaseq.md) - 双向RNA-Seq
|
|
75
|
+
|
|
76
|
+
[fastp](./docs/fastp.md) - [fastq文件质控](https://onlinelibrary.wiley.com/doi/10.1002/imt2.70078)
|
|
77
|
+
|
|
78
|
+
[genomescope](docs/genomescope.md) - 使用[GenomeScope 2.0](https://github.com/tbenavi1/genomescope2.0)和[Smudgeplot](https://github.com/KamilSJaron/smudgeplot)评估基因组大小和倍性
|
|
79
|
+
|
|
80
|
+
[iseq](./docs/iseq.md) - [iSeq下载测序数据](https://github.com/BioOmics/iSeq)
|
|
81
|
+
|
|
82
|
+
[rna-seq](./docs/rnaseq.md) - [Hisat2](https://www.nature.com/articles/s41587-019-0201-4) + [StringTie2](https://link.springer.com/article/10.1186/s13059-019-1910-1)转录组流程
|
|
83
|
+
|
|
84
|
+
[sra2fastq](docs/sra2fastq.md) - SRA文件转FASTQ文件-基于[parallel-fastq-dump](https://github.com/rvalieris/parallel-fastq-dump)
|
|
85
|
+
|
|
86
|
+
[vcf2pca](docs/vcf2pca.md) - 基于[VCF2PCACluster](https://github.com/hewm2008/VCF2PCACluster)和PLINK的VCF文件做PCA
|
|
87
|
+
|
|
88
|
+
[vcf2phylip](docs/vcf2phylip.md) - VCF文件转PHYLIP格式文件
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
## 许可证 | License
|
|
92
|
+
|
|
93
|
+
本项目采用 MIT 许可证 - 详见 [LICENSE](LICENSE) 文件
|
|
94
|
+
|
|
95
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
96
|
+
|
|
97
|
+
## 作者信息 | Author
|
|
98
|
+
|
|
99
|
+
**李详 (Xiang Li)**
|
|
100
|
+
- Email: lixiang117423@gmail.com
|
|
101
|
+
- GitHub: [@lixiang117423](https://github.com/lixiang117423)
|
|
102
|
+
|
|
103
|
+
## 致谢 | Acknowledgments
|
|
104
|
+
|
|
105
|
+
- 感谢所有为本项目做出贡献的开发者 | Thanks to all developers who contributed to this project
|
|
106
|
+
- 感谢开源社区的支持 | Thanks to the open source community for support
|
|
107
|
+
|
|
108
|
+
## 问题反馈 | Issues
|
|
109
|
+
|
|
110
|
+
如果遇到问题或有建议,请在GitHub上提交issue:
|
|
111
|
+
|
|
112
|
+
If you encounter problems or have suggestions, please submit an issue on GitHub:
|
|
113
|
+
|
|
114
|
+
https://github.com/lixiang117423/biopytools/issues
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ADMIXTURE群体结构分析工具包|ADMIXTURE Population Structure Analysis Toolkit
|
|
3
|
+
|
|
4
|
+
功能: VCF到ADMIXTURE分析的完整流程,支持群体结构分析和协变量生成
|
|
5
|
+
Features: Complete pipeline from VCF to ADMIXTURE analysis, supporting population structure analysis and covariate generation
|
|
6
|
+
|
|
7
|
+
作者|Author: Xiang LI
|
|
8
|
+
版本|Version: 1.0.0
|
|
9
|
+
日期|Date: 2024-12-30
|
|
10
|
+
|
|
11
|
+
使用示例|Usage Examples:
|
|
12
|
+
from biopytools.admixture import AdmixtureAnalyzer, AdmixtureConfig
|
|
13
|
+
|
|
14
|
+
# 创建分析器|Create analyzer
|
|
15
|
+
analyzer = AdmixtureAnalyzer(
|
|
16
|
+
vcf_file="data.vcf.gz",
|
|
17
|
+
output_dir="admixture_results",
|
|
18
|
+
min_k=2,
|
|
19
|
+
max_k=10
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# 运行分析|Run analysis
|
|
23
|
+
analyzer.run_analysis()
|
|
24
|
+
|
|
25
|
+
命令行使用|Command Line Usage:
|
|
26
|
+
python -m admixture.main -v input.vcf -o results -k 2 -K 10
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
__version__ = "1.0.0"
|
|
30
|
+
__author__ = "Xiang LI"
|
|
31
|
+
|
|
32
|
+
from .main import AdmixtureAnalyzer
|
|
33
|
+
from .config import AdmixtureConfig
|
|
34
|
+
|
|
35
|
+
__all__ = ['AdmixtureAnalyzer', 'AdmixtureConfig']
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ADMIXTURE分析核心模块|ADMIXTURE Analysis Core Module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import glob
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from .utils import CommandRunner
|
|
10
|
+
|
|
11
|
+
class AdmixtureAnalyzer:
    """Run ADMIXTURE for a range of K values and select the best K.

    The best K is the one whose log file reports the smallest
    cross-validation ("CV error") value.

    Attributes:
        config: Settings object. This class reads ``output_dir``, ``min_k``,
            ``max_k``, ``cv_folds`` and ``threads`` from it.
        logger: Logger-like object providing ``info``, ``warning`` and
            ``error``.
        cmd_runner: Object whose ``run(cmd, description)`` is handed the
            command string (presumably executed through a shell, since the
            command uses ``cd``, ``&&`` and output redirection).
    """

    def __init__(self, config, logger, cmd_runner: "CommandRunner"):
        self.config = config
        self.logger = logger
        self.cmd_runner = cmd_runner

    def run_admixture_analysis(self, plink_prefix: str):
        """Run ADMIXTURE for every K in ``[min_k, max_k]`` and return the best K.

        Args:
            plink_prefix: Path prefix of the PLINK file set; ``.bed`` is
                appended to locate the input file.

        Returns:
            The K value with the lowest cross-validation error, as ``int``.

        Raises:
            FileNotFoundError: If ``<plink_prefix>.bed`` does not exist.
            ValueError: If no CV error could be read from any log file.
        """
        self.logger.info(" 开始ADMIXTURE分析|Starting ADMIXTURE analysis")

        bed_file = f"{plink_prefix}.bed"
        if not os.path.exists(bed_file):
            raise FileNotFoundError(f" PLINK文件不存在|PLINK file not found: {bed_file}")

        # One ADMIXTURE run per K value.
        for k in range(self.config.min_k, self.config.max_k + 1):
            self._run_single_k(bed_file, k)

        # Pick the K with the smallest cross-validation error.
        best_k = self._find_best_k()

        self.logger.info(" ADMIXTURE分析完成|ADMIXTURE analysis completed")
        self.logger.info(f" 最优K值|Best K value: {best_k}")

        return best_k

    def _run_single_k(self, bed_file: str, k: int):
        """Run ADMIXTURE once for a single K value.

        The command changes into ``config.output_dir`` and refers to the bed
        file by basename only, so the bed file is expected to live in that
        directory. Output of the run is redirected to ``log_<k>.out`` there.

        Args:
            bed_file: Path of the PLINK ``.bed`` file (only its basename is
                used in the command).
            k: Number of ancestral populations for this run.

        Raises:
            Exception: Whatever ``cmd_runner.run`` raises is logged (together
                with the ADMIXTURE log, when present) and re-raised.
        """
        import shlex  # local import: only needed to quote shell arguments

        self.logger.info(f" 运行ADMIXTURE K={k}|Running ADMIXTURE K={k}")

        log_name = f"log_{k}.out"
        log_file = os.path.join(self.config.output_dir, log_name)

        # Use a path relative to the output directory to avoid long-path
        # problems; quote both user-controlled pieces so that spaces or shell
        # metacharacters cannot break (or alter) the command.
        bed_basename = os.path.basename(bed_file)
        cmd = (
            f"cd {shlex.quote(str(self.config.output_dir))} && "
            f"admixture --cv={self.config.cv_folds} -j{self.config.threads} "
            f"{shlex.quote(bed_basename)} {k} > {log_name} 2>&1"
        )

        try:
            self.cmd_runner.run(cmd, f"ADMIXTURE分析 K={k}|ADMIXTURE analysis K={k}")
        except Exception as e:
            self.logger.error(f" K={k}分析失败|K={k} analysis failed: {e}")
            # Surface the ADMIXTURE log to explain the failure.
            if os.path.exists(log_file):
                with open(log_file, 'r', errors='replace') as f:
                    log_content = f.read()
                self.logger.error(f" ADMIXTURE日志|ADMIXTURE log:\n{log_content}")
            raise

        # The run only counts as successful if the Q matrix was written.
        q_name = f"{os.path.splitext(bed_basename)[0]}.{k}.Q"
        expected_q_file = os.path.join(self.config.output_dir, q_name)
        if os.path.exists(expected_q_file):
            self.logger.info(f" K={k} 分析成功完成|K={k} analysis completed successfully")
        else:
            self.logger.warning(
                f" K={k} 分析可能未完全成功,Q文件不存在|K={k} analysis may not be fully successful, Q file missing"
            )

    def _find_best_k(self):
        """Return the K whose log reports the smallest CV error.

        Scans ``log_<k>.out`` in ``config.output_dir`` for every K in
        ``[min_k, max_k]``, taking the last whitespace-separated token of the
        first parseable line containing ``CV error``. All collected values
        are written to ``cv_results.csv`` in the output directory.

        Returns:
            The best K as a plain ``int``.

        Raises:
            ValueError: If no CV error value could be read from any log.
        """
        cv_results = []

        for k in range(self.config.min_k, self.config.max_k + 1):
            log_file = os.path.join(self.config.output_dir, f"log_{k}.out")
            if not os.path.exists(log_file):
                continue

            with open(log_file, 'r', errors='replace') as f:
                for line in f:
                    if 'CV error' not in line:
                        continue
                    try:
                        cv_error = float(line.strip().split()[-1])
                    except (ValueError, IndexError):
                        # A malformed line must not abort the whole selection.
                        self.logger.warning(
                            f" 无法解析CV误差|Could not parse CV error for K={k}: {line.strip()}"
                        )
                        continue
                    cv_results.append({'K': k, 'CV_error': cv_error})
                    break

        if not cv_results:
            raise ValueError(" 未找到CV误差信息|No CV error information found")

        cv_df = pd.DataFrame(cv_results)
        best_k = cv_df.loc[cv_df['CV_error'].idxmin(), 'K']

        # Keep the full K / CV-error table for later inspection.
        cv_file = os.path.join(self.config.output_dir, "cv_results.csv")
        cv_df.to_csv(cv_file, index=False)

        return int(best_k)
|
|
102
|
+
|
|
103
|
+
class ResultsProcessor:
    """Load ADMIXTURE output for the chosen K, summarise it and save reports.

    Attributes:
        config: Settings object. This class reads ``output_dir`` and
            ``base_name`` from it.
        logger: Logger-like object providing ``info`` and ``warning``.
    """

    def __init__(self, config, logger):
        self.config = config
        self.logger = logger

    def process_results(self, best_k: int):
        """Read the Q and P files for ``best_k``, compute statistics and save them.

        Args:
            best_k: The K value whose ``<base_name>.<K>.Q`` / ``.P`` files
                are read from ``config.output_dir``.

        Returns:
            A tuple ``(q_data, p_data, stats)``. ``p_data`` is ``None`` when
            the P file is missing.

        Raises:
            FileNotFoundError: If the Q file does not exist.
        """
        self.logger.info(" 处理分析结果|Processing analysis results")

        # Q file: per-individual ancestry proportions.
        q_file = os.path.join(self.config.output_dir, f"{self.config.base_name}.{best_k}.Q")
        q_data = self._read_q_file(q_file, best_k)

        # P file: allele frequencies.
        p_file = os.path.join(self.config.output_dir, f"{self.config.base_name}.{best_k}.P")
        p_data = self._read_p_file(p_file, best_k)

        stats = self._calculate_statistics(q_data, best_k)

        self._save_processed_results(q_data, p_data, stats, best_k)

        return q_data, p_data, stats

    def _read_q_file(self, q_file: str, k: int):
        """Read a whitespace-separated Q file into a DataFrame.

        Columns are named ``Pop1`` .. ``Pop<k>``. When
        ``<base_name>.fam`` exists in the output directory, its first two
        columns are attached as ``FID`` and ``IID``.

        Raises:
            FileNotFoundError: If ``q_file`` does not exist.
        """
        if not os.path.exists(q_file):
            raise FileNotFoundError(f" Q文件不存在|Q file not found: {q_file}")

        q_data = pd.read_csv(q_file, sep=r'\s+', header=None)
        q_data.columns = [f"Pop{i+1}" for i in range(k)]

        # Attach individual identifiers from the .fam file when available.
        fam_file = os.path.join(self.config.output_dir, f"{self.config.base_name}.fam")
        if os.path.exists(fam_file):
            fam_data = pd.read_csv(fam_file, sep=r'\s+', header=None)
            if len(fam_data) != len(q_data):
                # Rows are matched by position; a count mismatch would leave
                # NaN or misaligned IDs without any other indication.
                self.logger.warning(
                    f" FAM文件行数与Q文件不一致|FAM row count ({len(fam_data)}) "
                    f"differs from Q row count ({len(q_data)})"
                )
            q_data['FID'] = fam_data.iloc[:, 0]
            q_data['IID'] = fam_data.iloc[:, 1]

        return q_data

    def _read_p_file(self, p_file: str, k: int):
        """Read a whitespace-separated P file, or return ``None`` if it is missing.

        Columns are named ``Pop1`` .. ``Pop<k>``. A missing file is only
        logged as a warning.
        """
        if not os.path.exists(p_file):
            self.logger.warning(f" P文件不存在|P file not found: {p_file}")
            return None

        p_data = pd.read_csv(p_file, sep=r'\s+', header=None)
        p_data.columns = [f"Pop{i+1}" for i in range(k)]

        return p_data

    def _calculate_statistics(self, q_data: pd.DataFrame, k: int):
        """Summarise the ancestry proportions in ``q_data``.

        Returns:
            A dict with the number of rows, the count of rows whose admixture
            level (``1 - max ancestry``) exceeds 0.3, the count of rows whose
            maximum ancestry exceeds 0.9, and the means of both quantities.
            Counts are plain ``int`` and means plain ``float``.
        """
        pop_cols = [f"Pop{i+1}" for i in range(k)]

        # Largest ancestry component per individual.
        max_ancestry = q_data[pop_cols].max(axis=1)

        # Everything not explained by the dominant component.
        admixture_level = 1 - max_ancestry

        stats = {
            'total_individuals': len(q_data),
            'highly_admixed': int((admixture_level > 0.3).sum()),
            'pure_individuals': int((max_ancestry > 0.9).sum()),
            'mean_admixture_level': float(admixture_level.mean()),
            'mean_max_ancestry': float(max_ancestry.mean()),
        }

        return stats

    def _save_processed_results(self, q_data: pd.DataFrame, p_data: pd.DataFrame,
                                stats: dict, best_k: int):
        """Write the Q matrix as CSV and the statistics as a text report.

        Both files go to ``config.output_dir``
        (``admixture_proportions.csv`` and ``admixture_statistics.txt``).
        ``p_data`` is accepted but not written by this method.
        """
        q_output = os.path.join(self.config.output_dir, "admixture_proportions.csv")
        q_data.to_csv(q_output, index=False)

        stats_output = os.path.join(self.config.output_dir, "admixture_statistics.txt")
        # The report contains non-ASCII text, so the encoding is pinned rather
        # than left to the platform's locale default.
        with open(stats_output, 'w', encoding='utf-8') as f:
            f.write(" ADMIXTURE分析统计信息|ADMIXTURE Analysis Statistics\n")
            f.write("=" * 80 + "\n\n")
            f.write(f" 最优K值|Best K value: {best_k}\n")
            f.write(f"总个体数|Total individuals: {stats['total_individuals']}\n")
            f.write(f" 高度混合个体数|Highly admixed individuals: {stats['highly_admixed']}\n")
            f.write(f"纯合个体数|Pure individuals: {stats['pure_individuals']}\n")
            f.write(f"平均混合程度|Mean admixture level: {stats['mean_admixture_level']:.3f}\n")
            f.write(f" 平均最大祖先成分|Mean max ancestry: {stats['mean_max_ancestry']:.3f}\n")

        self.logger.info(" 结果已保存|Results saved:")
        self.logger.info(f" - 个体祖先成分|Individual ancestry proportions: {q_output}")
        self.logger.info(f" - 统计信息|Statistics: {stats_output}")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ADMIXTURE分析配置管理模块|ADMIXTURE Analysis Configuration Management Module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, List
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class AdmixtureConfig:
    """Configuration for an ADMIXTURE analysis run.

    On construction the input and output paths are made absolute and the
    output directory is created if it does not exist. Call ``validate()`` to
    check the parameter values.
    """

    # Required input file
    vcf_file: str
    output_dir: str = "admixture_results"

    # Analysis parameters
    min_k: int = 2
    max_k: int = 10
    cv_folds: int = 5
    threads: int = 64

    # Quality-control parameters
    maf: float = 0.05
    missing_rate: float = 0.1
    hwe_pvalue: float = 1e-6

    # Processing options
    skip_preprocessing: bool = False
    keep_intermediate: bool = False

    # Logging configuration
    log_level: str = "INFO"
    quiet: bool = False
    verbose: int = 0

    # Execution control
    force: bool = False
    dry_run: bool = False

    # Internal attribute: base name of the generated file set
    base_name: str = "admixture_ready"

    def __post_init__(self):
        """Normalise paths and make sure the output directory exists."""
        # Resolve both paths first so that ``output_path`` and ``output_dir``
        # always point at the same absolute location.
        self.vcf_file = os.path.normpath(os.path.abspath(self.vcf_file))
        self.output_dir = os.path.normpath(os.path.abspath(self.output_dir))

        self.output_path = Path(self.output_dir)
        self.output_path.mkdir(parents=True, exist_ok=True)

    def validate(self):
        """Check the configuration values.

        Returns:
            ``True`` when every check passes.

        Raises:
            ValueError: If any check fails; the message lists every problem
                found, one per line.
        """
        errors = []

        # Input file must exist.
        if not os.path.exists(self.vcf_file):
            errors.append(f"VCF文件不存在|VCF file does not exist: {self.vcf_file}")

        # K range must be positive and ordered.
        if self.min_k < 1 or self.max_k < self.min_k:
            errors.append(f"无效的K值范围|Invalid K range: {self.min_k} to {self.max_k}")

        # Thread count must be positive.
        if self.threads <= 0:
            errors.append(f"线程数必须为正整数|Thread count must be positive: {self.threads}")

        # Quality-control thresholds must lie in their valid ranges.
        if not 0 <= self.maf <= 0.5:
            errors.append(f"MAF值应在0-0.5之间|MAF should be between 0-0.5: {self.maf}")

        if not 0 <= self.missing_rate <= 1:
            errors.append(f"缺失率应在0-1之间|Missing rate should be between 0-1: {self.missing_rate}")

        if not 0 <= self.hwe_pvalue <= 1:
            errors.append(f"HWE p值应在0-1之间|HWE p-value should be between 0-1: {self.hwe_pvalue}")

        if errors:
            raise ValueError("配置错误|Configuration error:\n" + "\n".join(errors))

        return True
|