biopytools 0.21.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. biopytools-0.21.1/PKG-INFO +27 -0
  2. biopytools-0.21.1/README.md +114 -0
  3. biopytools-0.21.1/biopytools/admixture/__init__.py +35 -0
  4. biopytools-0.21.1/biopytools/admixture/analysis.py +200 -0
  5. biopytools-0.21.1/biopytools/admixture/config.py +82 -0
  6. biopytools-0.21.1/biopytools/admixture/data_processing.py +482 -0
  7. biopytools-0.21.1/biopytools/admixture/main.py +255 -0
  8. biopytools-0.21.1/biopytools/admixture/results.py +234 -0
  9. biopytools-0.21.1/biopytools/admixture/utils.py +212 -0
  10. biopytools-0.21.1/biopytools/agp2table/__init__.py +27 -0
  11. biopytools-0.21.1/biopytools/agp2table/config.py +52 -0
  12. biopytools-0.21.1/biopytools/agp2table/main.py +532 -0
  13. biopytools-0.21.1/biopytools/agp2table/utils.py +53 -0
  14. biopytools-0.21.1/biopytools/annovar/__init__.py +40 -0
  15. biopytools-0.21.1/biopytools/annovar/annotation.py +63 -0
  16. biopytools-0.21.1/biopytools/annovar/config.py +82 -0
  17. biopytools-0.21.1/biopytools/annovar/data_processing.py +147 -0
  18. biopytools-0.21.1/biopytools/annovar/main.py +265 -0
  19. biopytools-0.21.1/biopytools/annovar/results.py +61 -0
  20. biopytools-0.21.1/biopytools/annovar/results_processor.py +465 -0
  21. biopytools-0.21.1/biopytools/annovar/utils.py +247 -0
  22. biopytools-0.21.1/biopytools/bam_cov/__init__.py +31 -0
  23. biopytools-0.21.1/biopytools/bam_cov/config.py +108 -0
  24. biopytools-0.21.1/biopytools/bam_cov/main.py +286 -0
  25. biopytools-0.21.1/biopytools/bam_cov/utils.py +351 -0
  26. biopytools-0.21.1/biopytools/bam_stats/__init__.py +8 -0
  27. biopytools-0.21.1/biopytools/bam_stats/alignment_stats.py +102 -0
  28. biopytools-0.21.1/biopytools/bam_stats/batch_processor.py +215 -0
  29. biopytools-0.21.1/biopytools/bam_stats/config.py +150 -0
  30. biopytools-0.21.1/biopytools/bam_stats/core.py +149 -0
  31. biopytools-0.21.1/biopytools/bam_stats/coverage_stats.py +161 -0
  32. biopytools-0.21.1/biopytools/bam_stats/duplicate_stats.py +91 -0
  33. biopytools-0.21.1/biopytools/bam_stats/genome_stats.py +207 -0
  34. biopytools-0.21.1/biopytools/bam_stats/insert_stats.py +71 -0
  35. biopytools-0.21.1/biopytools/bam_stats/main.py +81 -0
  36. biopytools-0.21.1/biopytools/bam_stats/sample_stats.py +82 -0
  37. biopytools-0.21.1/biopytools/bam_stats/sequence_stats.py +137 -0
  38. biopytools-0.21.1/biopytools/bam_stats/utils.py +139 -0
  39. biopytools-0.21.1/biopytools/blast/__init__.py +29 -0
  40. biopytools-0.21.1/biopytools/blast/alignment_visualizer.py +201 -0
  41. biopytools-0.21.1/biopytools/blast/config.py +403 -0
  42. biopytools-0.21.1/biopytools/blast/html_alignment.py +276 -0
  43. biopytools-0.21.1/biopytools/blast/html_templates.py +443 -0
  44. biopytools-0.21.1/biopytools/blast/main.py +1090 -0
  45. biopytools-0.21.1/biopytools/blast/statistics.py +257 -0
  46. biopytools-0.21.1/biopytools/blast/text_alignment.py +189 -0
  47. biopytools-0.21.1/biopytools/blast/utils.py +218 -0
  48. biopytools-0.21.1/biopytools/busco/__init__.py +31 -0
  49. biopytools-0.21.1/biopytools/busco/analyzer.py +187 -0
  50. biopytools-0.21.1/biopytools/busco/config.py +103 -0
  51. biopytools-0.21.1/biopytools/busco/main.py +234 -0
  52. biopytools-0.21.1/biopytools/busco/results.py +227 -0
  53. biopytools-0.21.1/biopytools/busco/utils.py +224 -0
  54. biopytools-0.21.1/biopytools/bwa/__init__.py +11 -0
  55. biopytools-0.21.1/biopytools/bwa/alignment.py +137 -0
  56. biopytools-0.21.1/biopytools/bwa/config.py +171 -0
  57. biopytools-0.21.1/biopytools/bwa/coverage.py +104 -0
  58. biopytools-0.21.1/biopytools/bwa/genome.py +53 -0
  59. biopytools-0.21.1/biopytools/bwa/main.py +275 -0
  60. biopytools-0.21.1/biopytools/bwa/stats.py +90 -0
  61. biopytools-0.21.1/biopytools/bwa/utils.py +145 -0
  62. biopytools-0.21.1/biopytools/cli/commands/admixture.py +191 -0
  63. biopytools-0.21.1/biopytools/cli/commands/agp2table.py +104 -0
  64. biopytools-0.21.1/biopytools/cli/commands/annovar.py +150 -0
  65. biopytools-0.21.1/biopytools/cli/commands/bam_cov.py +184 -0
  66. biopytools-0.21.1/biopytools/cli/commands/bam_stats.py +108 -0
  67. biopytools-0.21.1/biopytools/cli/commands/blast.py +342 -0
  68. biopytools-0.21.1/biopytools/cli/commands/busco.py +281 -0
  69. biopytools-0.21.1/biopytools/cli/commands/bwa.py +311 -0
  70. biopytools-0.21.1/biopytools/cli/commands/deepbsa.py +138 -0
  71. biopytools-0.21.1/biopytools/cli/commands/dual_rnaseq.py +148 -0
  72. biopytools-0.21.1/biopytools/cli/commands/fastp.py +197 -0
  73. biopytools-0.21.1/biopytools/cli/commands/genome_analysis.py +174 -0
  74. biopytools-0.21.1/biopytools/cli/commands/hifi_hic.py +222 -0
  75. biopytools-0.21.1/biopytools/cli/commands/iseq.py +173 -0
  76. biopytools-0.21.1/biopytools/cli/commands/rnaseq.py +161 -0
  77. biopytools-0.21.1/biopytools/cli/commands/sra2fastq.py +130 -0
  78. biopytools-0.21.1/biopytools/cli/commands/vcf2pca.py +188 -0
  79. biopytools-0.21.1/biopytools/cli/commands/vcf2phylip.py +145 -0
  80. biopytools-0.21.1/biopytools/cli/main.py +244 -0
  81. biopytools-0.21.1/biopytools/deepbsa/__init__.py +10 -0
  82. biopytools-0.21.1/biopytools/deepbsa/config.py +97 -0
  83. biopytools-0.21.1/biopytools/deepbsa/main.py +161 -0
  84. biopytools-0.21.1/biopytools/deepbsa/merge_results.py +345 -0
  85. biopytools-0.21.1/biopytools/deepbsa/plot_data_calculator.py +362 -0
  86. biopytools-0.21.1/biopytools/deepbsa/runner.py +278 -0
  87. biopytools-0.21.1/biopytools/deepbsa/utils.py +250 -0
  88. biopytools-0.21.1/biopytools/dual_rnaseq/__init__.py +38 -0
  89. biopytools-0.21.1/biopytools/dual_rnaseq/bam_to_fastq.py +309 -0
  90. biopytools-0.21.1/biopytools/dual_rnaseq/classification.py +260 -0
  91. biopytools-0.21.1/biopytools/dual_rnaseq/config.py +93 -0
  92. biopytools-0.21.1/biopytools/dual_rnaseq/data_processing.py +226 -0
  93. biopytools-0.21.1/biopytools/dual_rnaseq/indexing.py +153 -0
  94. biopytools-0.21.1/biopytools/dual_rnaseq/main.py +283 -0
  95. biopytools-0.21.1/biopytools/dual_rnaseq/mapping_stats.py +286 -0
  96. biopytools-0.21.1/biopytools/dual_rnaseq/quantification.py +163 -0
  97. biopytools-0.21.1/biopytools/dual_rnaseq/results.py +148 -0
  98. biopytools-0.21.1/biopytools/dual_rnaseq/utils.py +132 -0
  99. biopytools-0.21.1/biopytools/fastp/__init__.py +35 -0
  100. biopytools-0.21.1/biopytools/fastp/config.py +145 -0
  101. biopytools-0.21.1/biopytools/fastp/data_processing.py +247 -0
  102. biopytools-0.21.1/biopytools/fastp/main.py +236 -0
  103. biopytools-0.21.1/biopytools/fastp/processing.py +155 -0
  104. biopytools-0.21.1/biopytools/fastp/results.py +82 -0
  105. biopytools-0.21.1/biopytools/fastp/utils.py +148 -0
  106. biopytools-0.21.1/biopytools/genome_analysis/__init__.py +8 -0
  107. biopytools-0.21.1/biopytools/genome_analysis/config.py +99 -0
  108. biopytools-0.21.1/biopytools/genome_analysis/genomescope.py +581 -0
  109. biopytools-0.21.1/biopytools/genome_analysis/main.py +618 -0
  110. biopytools-0.21.1/biopytools/genome_analysis/utils.py +950 -0
  111. biopytools-0.21.1/biopytools/hifi_hic/__init__.py +40 -0
  112. biopytools-0.21.1/biopytools/hifi_hic/assembler.py +237 -0
  113. biopytools-0.21.1/biopytools/hifi_hic/config.py +261 -0
  114. biopytools-0.21.1/biopytools/hifi_hic/logger.py +64 -0
  115. biopytools-0.21.1/biopytools/hifi_hic/main.py +383 -0
  116. biopytools-0.21.1/biopytools/hifi_hic/ngs_polisher.py +397 -0
  117. biopytools-0.21.1/biopytools/hifi_hic/purge_dups_wrapper.py +148 -0
  118. biopytools-0.21.1/biopytools/hifi_hic/report.py +234 -0
  119. biopytools-0.21.1/biopytools/hifi_hic/utils.py +200 -0
  120. biopytools-0.21.1/biopytools/iseq/__init__.py +28 -0
  121. biopytools-0.21.1/biopytools/iseq/calculator.py +192 -0
  122. biopytools-0.21.1/biopytools/iseq/config.py +108 -0
  123. biopytools-0.21.1/biopytools/iseq/main.py +158 -0
  124. biopytools-0.21.1/biopytools/iseq/utils.py +92 -0
  125. biopytools-0.21.1/biopytools/rnaseq/__init__.py +30 -0
  126. biopytools-0.21.1/biopytools/rnaseq/alignment.py +302 -0
  127. biopytools-0.21.1/biopytools/rnaseq/config.py +149 -0
  128. biopytools-0.21.1/biopytools/rnaseq/data_processing.py +403 -0
  129. biopytools-0.21.1/biopytools/rnaseq/main.py +446 -0
  130. biopytools-0.21.1/biopytools/rnaseq/quantification.py +190 -0
  131. biopytools-0.21.1/biopytools/rnaseq/results.py +267 -0
  132. biopytools-0.21.1/biopytools/rnaseq/utils.py +214 -0
  133. biopytools-0.21.1/biopytools/sra2fastq/__init__.py +29 -0
  134. biopytools-0.21.1/biopytools/sra2fastq/config.py +123 -0
  135. biopytools-0.21.1/biopytools/sra2fastq/main.py +149 -0
  136. biopytools-0.21.1/biopytools/sra2fastq/processor.py +141 -0
  137. biopytools-0.21.1/biopytools/sra2fastq/report.py +75 -0
  138. biopytools-0.21.1/biopytools/sra2fastq/utils.py +131 -0
  139. biopytools-0.21.1/biopytools/vcf2phylip/__init__.py +35 -0
  140. biopytools-0.21.1/biopytools/vcf2phylip/config.py +68 -0
  141. biopytools-0.21.1/biopytools/vcf2phylip/main.py +162 -0
  142. biopytools-0.21.1/biopytools/vcf2phylip/matrix_writer.py +146 -0
  143. biopytools-0.21.1/biopytools/vcf2phylip/processor.py +134 -0
  144. biopytools-0.21.1/biopytools/vcf2phylip/utils.py +75 -0
  145. biopytools-0.21.1/biopytools/vcf2phylip/vcf_parser.py +76 -0
  146. biopytools-0.21.1/biopytools/vcf_pca/__init__.py +30 -0
  147. biopytools-0.21.1/biopytools/vcf_pca/config.py +76 -0
  148. biopytools-0.21.1/biopytools/vcf_pca/data_processing.py +194 -0
  149. biopytools-0.21.1/biopytools/vcf_pca/main.py +186 -0
  150. biopytools-0.21.1/biopytools/vcf_pca/pca_analysis.py +174 -0
  151. biopytools-0.21.1/biopytools/vcf_pca/results.py +60 -0
  152. biopytools-0.21.1/biopytools/vcf_pca/utils.py +104 -0
  153. biopytools-0.21.1/biopytools/vcf_pca/visualization.py +171 -0
  154. biopytools-0.21.1/biopytools.egg-info/PKG-INFO +27 -0
  155. biopytools-0.21.1/biopytools.egg-info/SOURCES.txt +159 -0
  156. biopytools-0.21.1/biopytools.egg-info/dependency_links.txt +1 -0
  157. biopytools-0.21.1/biopytools.egg-info/entry_points.txt +2 -0
  158. biopytools-0.21.1/biopytools.egg-info/requires.txt +19 -0
  159. biopytools-0.21.1/biopytools.egg-info/top_level.txt +1 -0
  160. biopytools-0.21.1/pyproject.toml +46 -0
  161. biopytools-0.21.1/setup.cfg +4 -0
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: biopytools
3
+ Version: 0.21.1
4
+ Summary: Bioinformatics tools package with Python.
5
+ Author-email: Xiang LI <lixiang117423@gmail.com>
6
+ Project-URL: Homepage, https://github.com/lixiang117423/biopytools
7
+ Project-URL: Documentation, https://lixiang117423.github.io/article/biopytools-readme/
8
+ Project-URL: Repository, https://github.com/lixiang117423/biopytools.git
9
+ Project-URL: Issues, https://github.com/lixiang117423/biopytools/issues
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: pandas
12
+ Requires-Dist: numpy
13
+ Requires-Dist: pyfastx>=0.8.4
14
+ Requires-Dist: scikit-learn>=1.0.0
15
+ Requires-Dist: matplotlib>=3.5.0
16
+ Requires-Dist: seaborn>=0.11.0
17
+ Requires-Dist: click>=8.0.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=6.0; extra == "dev"
20
+ Requires-Dist: pytest-cov; extra == "dev"
21
+ Requires-Dist: black; extra == "dev"
22
+ Requires-Dist: isort; extra == "dev"
23
+ Requires-Dist: flake8; extra == "dev"
24
+ Provides-Extra: docs
25
+ Requires-Dist: mkdocs; extra == "docs"
26
+ Requires-Dist: mkdocs-material; extra == "docs"
27
+ Requires-Dist: mkdocs-mermaid2-plugin; extra == "docs"
@@ -0,0 +1,114 @@
1
+ # BioPyTools
2
+
3
+ A Python toolkit for bioinformatics analysis and computational biology.
4
+
5
+ 一个用于生物信息学分析和计算生物学的Python工具包。
6
+
7
+ ## 简介 | Overview
8
+
9
+ BioPyTools 是一个专为生物信息学研究设计的Python工具包,提供了一系列常用的生物数据分析功能。
10
+
11
+ BioPyTools is a Python toolkit designed for bioinformatics research, providing a series of commonly used biological data analysis functions.
12
+
13
+ ## 系统要求 | Requirements
14
+
15
+ - Python >= 3.10
16
+ - NumPy >= 1.19.0
17
+ - Pandas >= 1.2.0
18
+ - Matplotlib >= 3.3.0
19
+
20
+ ## 环境配置 / Environment Setup
21
+
22
+ 相关的Conda 环境配置文件位于 [`conda_env/`](conda_env/) 目录下。
23
+
24
+ Conda environment files can be found in the [`conda_env/`](conda_env/) directory.
25
+
26
+ ## 安装方法 | Installation
27
+
28
+ ### 从源码安装 | Install from source
29
+
30
+ ```bash
31
+ git clone https://github.com/lixiang117423/biopytools.git
32
+ cd biopytools
33
+ pip install -e .
34
+
35
+ # or
36
+ pip install .
37
+ ```
38
+
39
+ ## 使用方法 | Usage
40
+
41
+ ### 查看帮助 | Getting Help
42
+
43
+ ```bash
44
+ biopytools -h
45
+ Usage: biopytools [OPTIONS] COMMAND [ARGS]...
46
+
47
+ BioPyTools - 生物信息学分析工具包
48
+
49
+ 要查看特定命令的帮助,请运行:biopytools <命令> -h/--help, 如biopytools annovar -h
50
+
51
+ Options:
52
+ -v, --version Show the version and exit.
53
+ -h, --help Show this message and exit.
54
+ ```
55
+
56
+ ## 模块文档 | README
57
+
58
+ [admixture](./docs/admixture.md) - [群体结构分析软件Admixture](https://genome.cshlp.org/content/19/9/1655)
59
+
60
+ [agp2table](./docs/agp2table.md) - agp文件转换为table
61
+
62
+ [annovar](./docs/annovar.md) - [ANNOVAR变异功能注释工具](https://academic.oup.com/nar/article/38/16/e164/1749458)
63
+
64
+ [bam-cov](./docs/bam_coverage_stats.md) - BAM文件覆盖度统计
65
+
66
+ [blast](./docs/blast_v2.md) - [序列比对工具](https://academic.oup.com/nar/article/36/suppl_2/W5/2505810?login=true)
67
+
68
+ [busco](./docs/busco.md) - [BUSCO](https://academic.oup.com/bioinformatics/article/31/19/3210/211866)
69
+
70
+ [bwa](docs/bam_coverage_stats.md) - [BWA比对](https://academic.oup.com/bioinformatics/article/25/14/1754/225615)
71
+
72
+ [deepbsa](./docs/deepbsa.md) - [DeepBSA](https://github.com/lizhao007/DeepBSA/)
73
+
74
+ [dual-rnaseq](docs/dual_rnaseq.md) - 双向RNA-Seq
75
+
76
+ [fastp](./docs/fastp.md) - [fastq文件质控](https://onlinelibrary.wiley.com/doi/10.1002/imt2.70078)
77
+
78
+ [genomescope](docs/genomescope.md) - 使用[GenomeScope 2.0](https://github.com/tbenavi1/genomescope2.0)和[Smudgeplot](https://github.com/KamilSJaron/smudgeplot)评估基因组大小和倍性
79
+
80
+ [iseq](./docs/iseq.md) - [iSeq下载测序数据](https://github.com/BioOmics/iSeq)
81
+
82
+ [rna-seq](./docs/rnaseq.md) - [Hisat2](https://www.nature.com/articles/s41587-019-0201-4) + [StringTie2](https://link.springer.com/article/10.1186/s13059-019-1910-1)转录组流程
83
+
84
+ [sra2fastq](docs/sra2fastq.md) - SRA文件转FASTQ文件-基于[parallel-fastq-dump](https://github.com/rvalieris/parallel-fastq-dump)
85
+
86
+ [vcf2pca](docs/vcf2pca.md) - 基于[VCF2PCACluster](https://github.com/hewm2008/VCF2PCACluster)和PLINK的VCF文件做PCA
87
+
88
+ [vcf2phylip](docs/vcf2phylip.md) - VCF文件转PHYLIP格式文件
89
+
90
+
91
+ ## 许可证 | License
92
+
93
+ 本项目采用 MIT 许可证 - 详见 [LICENSE](LICENSE) 文件
94
+
95
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
96
+
97
+ ## 作者信息 | Author
98
+
99
+ **李详 (Xiang Li)**
100
+ - Email: lixiang117423@gmail.com
101
+ - GitHub: [@lixiang117423](https://github.com/lixiang117423)
102
+
103
+ ## 致谢 | Acknowledgments
104
+
105
+ - 感谢所有为本项目做出贡献的开发者 | Thanks to all developers who contributed to this project
106
+ - 感谢开源社区的支持 | Thanks to the open source community for support
107
+
108
+ ## 问题反馈 | Issues
109
+
110
+ 如果遇到问题或有建议,请在GitHub上提交issue:
111
+
112
+ If you encounter problems or have suggestions, please submit an issue on GitHub:
113
+
114
+ https://github.com/lixiang117423/biopytools/issues
@@ -0,0 +1,35 @@
1
+ """
2
+ ADMIXTURE群体结构分析工具包|ADMIXTURE Population Structure Analysis Toolkit
3
+
4
+ 功能: VCF到ADMIXTURE分析的完整流程,支持群体结构分析和协变量生成
5
+ Features: Complete pipeline from VCF to ADMIXTURE analysis, supporting population structure analysis and covariate generation
6
+
7
+ 作者|Author: Xiang LI
8
+ 版本|Version: 1.0.0
9
+ 日期|Date: 2024-12-30
10
+
11
+ 使用示例|Usage Examples:
12
+ from biopytools.admixture import AdmixtureAnalyzer, AdmixtureConfig
13
+
14
+ # 创建分析器|Create analyzer
15
+ analyzer = AdmixtureAnalyzer(
16
+ vcf_file="data.vcf.gz",
17
+ output_dir="admixture_results",
18
+ min_k=2,
19
+ max_k=10
20
+ )
21
+
22
+ # 运行分析|Run analysis
23
+ analyzer.run_analysis()
24
+
25
+ 命令行使用|Command Line Usage:
26
+ python -m admixture.main -v input.vcf -o results -k 2 -K 10
27
+ """
28
+
29
+ __version__ = "1.0.0"
30
+ __author__ = "Xiang LI"
31
+
32
+ from .main import AdmixtureAnalyzer
33
+ from .config import AdmixtureConfig
34
+
35
+ __all__ = ['AdmixtureAnalyzer', 'AdmixtureConfig']
@@ -0,0 +1,200 @@
1
+ """
2
+ ADMIXTURE分析核心模块|ADMIXTURE Analysis Core Module
3
+ """
4
+
5
+ import os
6
+ import glob
7
+ import pandas as pd
8
+ from pathlib import Path
9
+ from .utils import CommandRunner
10
+
11
class AdmixtureAnalyzer:
    """Run ADMIXTURE for a range of K values and select the best K.

    The analyzer shells out to the ``admixture`` executable once per K in
    ``config.min_k .. config.max_k`` (inclusive), redirecting each run's output
    to ``log_<K>.out`` inside ``config.output_dir``, then parses the
    cross-validation error from those logs.

    Attributes read from ``config``: ``output_dir``, ``min_k``, ``max_k``,
    ``cv_folds`` and ``threads``.
    """

    def __init__(self, config, logger, cmd_runner: "CommandRunner"):
        """Store the collaborators used by the analysis.

        Args:
            config: Object exposing the configuration attributes listed on the
                class docstring.
            logger: Logger-like object (``info``/``warning``/``error``).
            cmd_runner: Object whose ``run(cmd, description)`` executes a shell
                command string.
        """
        self.config = config
        self.logger = logger
        self.cmd_runner = cmd_runner

    def run_admixture_analysis(self, plink_prefix: str):
        """Run ADMIXTURE for every K and return the K with the lowest CV error.

        Args:
            plink_prefix: Path prefix of the PLINK fileset; ``<prefix>.bed``
                must exist.

        Returns:
            The best K as an ``int``.

        Raises:
            FileNotFoundError: If ``<plink_prefix>.bed`` does not exist.
            ValueError: If no CV error could be parsed from any log file.
        """
        self.logger.info(" 开始ADMIXTURE分析|Starting ADMIXTURE analysis")

        bed_file = f"{plink_prefix}.bed"
        if not os.path.exists(bed_file):
            raise FileNotFoundError(f" PLINK文件不存在|PLINK file not found: {bed_file}")

        # One ADMIXTURE run per K value, inclusive of max_k.
        for k in range(self.config.min_k, self.config.max_k + 1):
            self._run_single_k(bed_file, k)

        best_k = self._find_best_k()

        self.logger.info(" ADMIXTURE分析完成|ADMIXTURE analysis completed")
        self.logger.info(f" 最优K值|Best K value: {best_k}")

        return best_k

    def _run_single_k(self, bed_file: str, k: int):
        """Run ADMIXTURE for a single K value.

        The command changes into ``config.output_dir`` and refers to the bed
        file by base name only, so the command line stays short; this relies on
        the bed file being located in the output directory.

        Args:
            bed_file: Path to the PLINK ``.bed`` file.
            k: Number of ancestral populations for this run.

        Raises:
            Exception: Whatever ``cmd_runner.run`` raises is logged (together
                with the ADMIXTURE log, when present) and re-raised.
        """
        import shlex

        self.logger.info(f" 运行ADMIXTURE K={k}|Running ADMIXTURE K={k}")

        log_name = f"log_{k}.out"
        log_file = os.path.join(self.config.output_dir, log_name)

        # Quote every path fragment: the command is a shell string, so a
        # directory or file name containing spaces or metacharacters would
        # otherwise split the command or be interpreted by the shell.
        bed_basename = os.path.basename(bed_file)
        cmd = (
            f"cd {shlex.quote(str(self.config.output_dir))} && "
            f"admixture --cv={self.config.cv_folds} -j{self.config.threads} "
            f"{shlex.quote(bed_basename)} {k} > {log_name} 2>&1"
        )

        try:
            self.cmd_runner.run(cmd, f"ADMIXTURE分析 K={k}|ADMIXTURE analysis K={k}")

            # ADMIXTURE names its output <bed stem>.<K>.Q in the working directory.
            expected_q_file = os.path.join(
                self.config.output_dir,
                f"{os.path.splitext(bed_basename)[0]}.{k}.Q",
            )
            if os.path.exists(expected_q_file):
                self.logger.info(f" K={k} 分析成功完成|K={k} analysis completed successfully")
            else:
                self.logger.warning(f" K={k} 分析可能未完全成功,Q文件不存在|K={k} analysis may not be fully successful, Q file missing")

        except Exception as e:
            self.logger.error(f" K={k}分析失败|K={k} analysis failed: {e}")
            # Surface the tool's own log to explain the failure. Undecodable
            # bytes are replaced so that reading the log can never mask the
            # original exception.
            if os.path.exists(log_file):
                with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                    log_content = f.read()
                self.logger.error(f" ADMIXTURE日志|ADMIXTURE log:\n{log_content}")
            raise

    def _find_best_k(self):
        """Parse the per-K logs and return the K with the smallest CV error.

        Only the first line containing ``CV error`` in each log is used; its
        last whitespace-separated token is parsed as the error value. Logs that
        are missing or contain no such line are skipped. The collected table is
        also written to ``cv_results.csv`` in the output directory.

        Returns:
            The best K as an ``int``.

        Raises:
            ValueError: If no log yielded a CV error (or a value is not a float).
        """
        cv_results = []

        for k in range(self.config.min_k, self.config.max_k + 1):
            log_file = os.path.join(self.config.output_dir, f"log_{k}.out")
            if not os.path.exists(log_file):
                continue

            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    if 'CV error' in line:
                        cv_error = float(line.strip().split()[-1])
                        cv_results.append({'K': k, 'CV_error': cv_error})
                        break

        if not cv_results:
            raise ValueError(" 未找到CV误差信息|No CV error information found")

        cv_df = pd.DataFrame(cv_results)
        best_k = cv_df.loc[cv_df['CV_error'].idxmin(), 'K']

        cv_file = os.path.join(self.config.output_dir, "cv_results.csv")
        cv_df.to_csv(cv_file, index=False)

        # Convert the numpy integer to a plain int for callers.
        return int(best_k)
102
+
103
class ResultsProcessor:
    """Load ADMIXTURE output for the chosen K and write summary files.

    Attributes read from ``config``: ``output_dir`` (where ADMIXTURE wrote its
    files and where the summaries go) and ``base_name`` (stem of the
    ``.Q``/``.P``/``.fam`` files).
    """

    def __init__(self, config, logger):
        """Store the configuration object and the logger."""
        self.config = config
        self.logger = logger

    def process_results(self, best_k: int):
        """Read the Q/P matrices for ``best_k``, summarize them and save results.

        Args:
            best_k: The K value whose output files should be processed.

        Returns:
            A tuple ``(q_data, p_data, stats)``; ``p_data`` is ``None`` when the
            P file is missing.

        Raises:
            FileNotFoundError: If the Q file for ``best_k`` does not exist.
        """
        self.logger.info(" 处理分析结果|Processing analysis results")

        # Q file: per-individual ancestry proportions.
        q_file = os.path.join(self.config.output_dir, f"{self.config.base_name}.{best_k}.Q")
        q_data = self._read_q_file(q_file, best_k)

        # P file: per-population allele frequencies.
        p_file = os.path.join(self.config.output_dir, f"{self.config.base_name}.{best_k}.P")
        p_data = self._read_p_file(p_file, best_k)

        stats = self._calculate_statistics(q_data, best_k)

        self._save_processed_results(q_data, p_data, stats, best_k)

        return q_data, p_data, stats

    def _read_q_file(self, q_file: str, k: int):
        """Read a whitespace-separated Q file into a DataFrame.

        Columns are named ``Pop1..Pop<k>``. When ``<base_name>.fam`` exists in
        the output directory, its first two columns are attached as ``FID`` and
        ``IID``.

        Raises:
            FileNotFoundError: If ``q_file`` does not exist.
        """
        if not os.path.exists(q_file):
            raise FileNotFoundError(f" Q文件不存在|Q file not found: {q_file}")

        q_data = pd.read_csv(q_file, sep=r'\s+', header=None)
        q_data.columns = [f"Pop{i+1}" for i in range(k)]

        fam_file = os.path.join(self.config.output_dir, f"{self.config.base_name}.fam")
        if os.path.exists(fam_file):
            # Read identifiers as strings: type inference would turn an ID such
            # as "007" into the integer 7 and corrupt the sample names.
            fam_data = pd.read_csv(fam_file, sep=r'\s+', header=None, dtype=str)
            if len(fam_data) != len(q_data):
                # Assignment below aligns on the row index, so a mismatch would
                # otherwise produce missing or dropped IDs without any notice.
                self.logger.warning(
                    f" FAM文件行数与Q文件不一致|FAM row count differs from Q file: "
                    f"{len(fam_data)} vs {len(q_data)}"
                )
            q_data['FID'] = fam_data.iloc[:, 0]
            q_data['IID'] = fam_data.iloc[:, 1]

        return q_data

    def _read_p_file(self, p_file: str, k: int):
        """Read a whitespace-separated P file, or return ``None`` if it is missing.

        Columns are named ``Pop1..Pop<k>``; a missing file only logs a warning.
        """
        if not os.path.exists(p_file):
            self.logger.warning(f" P文件不存在|P file not found: {p_file}")
            return None

        p_data = pd.read_csv(p_file, sep=r'\s+', header=None)
        p_data.columns = [f"Pop{i+1}" for i in range(k)]

        return p_data

    def _calculate_statistics(self, q_data: pd.DataFrame, k: int):
        """Summarize ancestry proportions across individuals.

        The admixture level of an individual is ``1 - max(Pop1..Pop<k>)``.
        Individuals with a level above 0.3 count as highly admixed; those whose
        largest component exceeds 0.9 count as pure.

        Returns:
            A dict with ``total_individuals``, ``highly_admixed``,
            ``pure_individuals``, ``mean_admixture_level`` and
            ``mean_max_ancestry``.
        """
        pop_cols = [f"Pop{i+1}" for i in range(k)]

        max_ancestry = q_data[pop_cols].max(axis=1)
        admixture_level = 1 - max_ancestry

        stats = {
            'total_individuals': len(q_data),
            # Vectorized counts, converted to plain ints.
            'highly_admixed': int((admixture_level > 0.3).sum()),
            'pure_individuals': int((max_ancestry > 0.9).sum()),
            'mean_admixture_level': admixture_level.mean(),
            'mean_max_ancestry': max_ancestry.mean()
        }

        return stats

    def _save_processed_results(self, q_data: pd.DataFrame, p_data: pd.DataFrame,
                                stats: dict, best_k: int):
        """Write the Q matrix as CSV and the statistics as a text report.

        Outputs ``admixture_proportions.csv`` and ``admixture_statistics.txt``
        in the output directory. ``p_data`` is accepted for interface
        compatibility but is not written anywhere by this method.
        """
        q_output = os.path.join(self.config.output_dir, "admixture_proportions.csv")
        q_data.to_csv(q_output, index=False)

        stats_output = os.path.join(self.config.output_dir, "admixture_statistics.txt")
        # The report contains non-ASCII text; an explicit encoding keeps the
        # write from failing on systems whose locale codec is not UTF-8.
        with open(stats_output, 'w', encoding='utf-8') as f:
            f.write(" ADMIXTURE分析统计信息|ADMIXTURE Analysis Statistics\n")
            f.write("=" * 80 + "\n\n")
            f.write(f" 最优K值|Best K value: {best_k}\n")
            f.write(f"总个体数|Total individuals: {stats['total_individuals']}\n")
            f.write(f" 高度混合个体数|Highly admixed individuals: {stats['highly_admixed']}\n")
            f.write(f"纯合个体数|Pure individuals: {stats['pure_individuals']}\n")
            f.write(f"平均混合程度|Mean admixture level: {stats['mean_admixture_level']:.3f}\n")
            f.write(f" 平均最大祖先成分|Mean max ancestry: {stats['mean_max_ancestry']:.3f}\n")

        self.logger.info(" 结果已保存|Results saved:")
        self.logger.info(f" - 个体祖先成分|Individual ancestry proportions: {q_output}")
        self.logger.info(f" - 统计信息|Statistics: {stats_output}")
@@ -0,0 +1,82 @@
1
+ """
2
+ ADMIXTURE分析配置管理模块|ADMIXTURE Analysis Configuration Management Module
3
+ """
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Optional, List
9
+
10
+
11
@dataclass
class AdmixtureConfig:
    """Configuration for the ADMIXTURE analysis pipeline.

    Constructing an instance creates ``output_dir`` (including parents) and
    rewrites ``vcf_file`` and ``output_dir`` as normalized absolute paths.
    Call :meth:`validate` to check the parameter values.
    """

    # Required input
    vcf_file: str
    output_dir: str = "admixture_results"

    # Analysis parameters
    min_k: int = 2
    max_k: int = 10
    cv_folds: int = 5
    threads: int = 64

    # Quality-control parameters
    maf: float = 0.05
    missing_rate: float = 0.1
    hwe_pvalue: float = 1e-6

    # Processing options
    skip_preprocessing: bool = False
    keep_intermediate: bool = False

    # Logging configuration
    log_level: str = "INFO"
    quiet: bool = False
    verbose: int = 0

    # Execution control (the field `force` was previously declared twice;
    # the redundant second declaration is dropped, field order is unchanged)
    force: bool = False
    dry_run: bool = False

    # Internal attributes
    base_name: str = "admixture_ready"

    def __post_init__(self):
        """Create the output directory and normalize the configured paths."""
        self.output_path = Path(self.output_dir)
        self.output_path.mkdir(parents=True, exist_ok=True)

        self.vcf_file = os.path.normpath(os.path.abspath(self.vcf_file))
        self.output_dir = os.path.normpath(os.path.abspath(self.output_dir))

    def validate(self):
        """Check all parameters and report every problem at once.

        Returns:
            ``True`` when the configuration is valid.

        Raises:
            ValueError: If any check fails; the message lists all failures,
                one per line.
        """
        errors = []

        if not os.path.exists(self.vcf_file):
            errors.append(f"VCF文件不存在|VCF file does not exist: {self.vcf_file}")

        if self.min_k < 1 or self.max_k < self.min_k:
            errors.append(f"无效的K值范围|Invalid K range: {self.min_k} to {self.max_k}")

        # Cross-validation needs at least two folds to hold data out.
        if self.cv_folds < 2:
            errors.append(f"交叉验证折数必须至少为2|CV folds must be at least 2: {self.cv_folds}")

        if self.threads <= 0:
            errors.append(f"线程数必须为正整数|Thread count must be positive: {self.threads}")

        if not 0 <= self.maf <= 0.5:
            errors.append(f"MAF值应在0-0.5之间|MAF should be between 0-0.5: {self.maf}")

        if not 0 <= self.missing_rate <= 1:
            errors.append(f"缺失率应在0-1之间|Missing rate should be between 0-1: {self.missing_rate}")

        # A p-value threshold outside [0, 1] cannot be meaningful.
        if not 0 <= self.hwe_pvalue <= 1:
            errors.append(f"HWE p值应在0-1之间|HWE p-value should be between 0-1: {self.hwe_pvalue}")

        if errors:
            raise ValueError("配置错误|Configuration error:\n" + "\n".join(errors))

        return True