tb-consensus-aligner 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tb_consensus_aligner-1.0.0/PKG-INFO +151 -0
- tb_consensus_aligner-1.0.0/README.md +138 -0
- tb_consensus_aligner-1.0.0/pyproject.toml +28 -0
- tb_consensus_aligner-1.0.0/setup.cfg +4 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner/__init__.py +0 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner/consensus_galaxy.py +491 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner/main_galaxy.py +161 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner/snp_aligner_galaxy.py +264 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner.egg-info/PKG-INFO +151 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner.egg-info/SOURCES.txt +12 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner.egg-info/dependency_links.txt +1 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner.egg-info/entry_points.txt +2 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner.egg-info/requires.txt +1 -0
- tb_consensus_aligner-1.0.0/tb_consensus_aligner.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tb_consensus_aligner
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Building consensus fasta files and variable multi-sequence alignments for mycobacterial genomes.
|
|
5
|
+
Author-email: scg40 <gian.schuepbach@swisstph.ch>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: biopython>=1.80
|
|
13
|
+
|
|
14
|
+
# TBConsensusAligner
|
|
15
|
+
TBConsensusAligner is a tool designed to create a multi-sequence alignment via consensus
|
|
16
|
+
genomes created from VCF files.
|
|
17
|
+
It can currently be used in different modes as follows
|
|
18
|
+
- VCF to SNP alignment `-s all`
|
|
19
|
+
- input = VCFs, output = SNP alignment
|
|
20
|
+
- runs `consensus_galaxy.py` and `snp_aligner_galaxy.py`
|
|
21
|
+
|
|
22
|
+
- VCF to consensus FASTA files `-s consensus`
|
|
23
|
+
- input = VCFs, output = consensus FASTA files
|
|
24
|
+
- runs only `consensus_galaxy.py`
|
|
25
|
+
|
|
26
|
+
If the script produces a SNP alignment from VCFs, the user can choose whether to output just the SNP alignment
|
|
27
|
+
or the used consensus FASTA files as well with the `-m` option.
|
|
28
|
+
|
|
29
|
+
When running the `consensus_galaxy.py` the user has to provide the VCF files, the reference genome used in their creation and the multisample
|
|
30
|
+
depth file from samtools depth. Optionally, the user can provide one or several BED files to mask certain regions of the genome.
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
The script is run via the command line.
|
|
34
|
+
|
|
35
|
+
***usage***:
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
Usage: TBConsensusAligner [options]
|
|
39
|
+
|
|
40
|
+
Options: -s STR ['consensus' or 'all'] Create either consensus files or consensus files and a variable alignment
|
|
41
|
+
|
|
42
|
+
-m STR ['everything' or 'alignment_only'] If -s all is chosen, output either consensus files and the alignment or just the alignment
|
|
43
|
+
|
|
44
|
+
-v list[STR] Path to the input VCF files
|
|
45
|
+
|
|
46
|
+
-r STR Path to the reference genome
|
|
47
|
+
|
|
48
|
+
-d STR Path to the depth file created by samtools depth
|
|
49
|
+
|
|
50
|
+
-c STR Path to the outgroup VCF for the alignment
|
|
51
|
+
|
|
52
|
+
-b list[STR] Path to the BED files to mask certain genomic regions
|
|
53
|
+
|
|
54
|
+
-o STR Path to the output directory
|
|
55
|
+
|
|
56
|
+
-g FLOAT Percentage of undefined states allowed per polymorphic position in the alignment default at 90 percent value=0.9
|
|
57
|
+
|
|
58
|
+
-t BOOLEAN Test Mode to run the script with smaller genomes
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Example usages command line
|
|
63
|
+
|
|
64
|
+
### From 3 VCFs to alignment, output consensus FASTAs and SNP alignment, masked by two bedfiles
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
python yourfolder/main_galaxy.py \
|
|
68
|
+
-s all \
|
|
69
|
+
-m everything \
|
|
70
|
+
-v path/to/vcf1/vcf1.vcf -v path/to/vcf2/vcf2.vcf -v path/to/vcf3/vcf3.vcf \
|
|
71
|
+
-r path/to/reference/reference_genome.reference.fasta \
|
|
72
|
+
-d path/to/depthfile/depthfile.tabular \
|
|
73
|
+
-c path/to/outgroupvcf/outgroup.vcf \
|
|
74
|
+
-b path/to/bedfile1/bed1.bed -b path/to/bedfile2/bed2.bed \
|
|
75
|
+
-o output \
|
|
76
|
+
-g 0.9
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Logic consensus_galaxy.py
|
|
81
|
+
|
|
82
|
+
This script produces consensus FASTA files from VCFs. The user provides the VCFs, the reference genome,
|
|
83
|
+
the multisample depth file obtained from `samtools depth` and optionally BED files to mask certain genomic regions.
|
|
84
|
+
|
|
85
|
+
***algorithm***
|
|
86
|
+
|
|
87
|
+
We loop through each position in the reference genome and build the consensus sequence base per base with the following rules:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
- SNPs and MNPs with a frequency >90% are encoded with the alternative base from the VCF.
|
|
91
|
+
- SNPs and MNPs with a frequency between 10% and 90% are encoded with the ambiguity base.
|
|
92
|
+
- SNPs and MNPs with a frequency <10% are encoded with the ancestral base from the reference genome.
|
|
93
|
+
|
|
94
|
+
- Large deletions (coverage 0) are encoded by dashes (-).
|
|
95
|
+
- Small deletions with frequency >90% are encoded by dashes (-).
|
|
96
|
+
- Small deletions with frequency between 10% and 90% are encoded with a 'N'.
|
|
97
|
+
- Small deletions with frequency <10% are encoded with the ancestral base from the reference genome.
|
|
98
|
+
|
|
99
|
+
- Small insertions (alternative bases longer than reference base) are encoded with the ancestral state from the reference genome.
|
|
100
|
+
|
|
101
|
+
- Sites to exclude specified in the bed files are encoded with a 'N'.
|
|
102
|
+
- Variants that have a phred score < 20 are encoded with a 'N'.
|
|
103
|
+
- Sites that are not in the VCF and are covered by less than 5 reads (via depth file) are encoded with a 'N'.
|
|
104
|
+
|
|
105
|
+
- If more than one ALT allele is present, we consider the one with the highest allele frequency.
|
|
106
|
+
```
|
|
107
|
+
Since the VCFs this script is tailored to do not have the allele frequency `AF` calculated, we calculate it using
|
|
108
|
+
`RO = REF allele occurrence` and `AO = ALT allele occurrence` with
|
|
109
|
+
`AF = AO / ( sum(all AO's) + RO )`.
|
|
110
|
+
|
|
111
|
+
## Logic snp_aligner_galaxy.py
|
|
112
|
+
|
|
113
|
+
This script produces a SNP alignment i.e. multi-sequence alignment of polymorphic positions in FASTA format from consensus FASTA files.
|
|
114
|
+
The consensus FASTAs are generated in the previous step by `consensus_galaxy.py` wrapped in `TBConsensusAligner`.
|
|
115
|
+
|
|
116
|
+
***algorithm***
|
|
117
|
+
|
|
118
|
+
The script creates a multi-sequence alignment from the consensus FASTAs in the form of a 2D array where each row represents a sequence
|
|
119
|
+
and each column a genomic position. Each column is checked for the presence of a polymorphism. If such a polymorphism is detected,
|
|
120
|
+
the column is appended to the alignment of polymorphic positions. For each polymorphic position, the corresponding nucleotide from
|
|
121
|
+
the outgroup VCF is retrieved to get the outgroup's sequence.
|
|
122
|
+
|
|
123
|
+
The algorithm to populate the multi-sequence alignment of polymorphic positions is as follows.
|
|
124
|
+
The user can control the proportion of gaps or undefined states (`-` or `N`) for a polymorphic position
|
|
125
|
+
to be kept in the alignment (argument `-g`). By default, this is set to `0.9`, meaning that if a polymorphic position has
|
|
126
|
+
more than 90% gaps or undefined states, it will not be in the final alignment.
|
|
127
|
+
|
|
128
|
+
## Directory structure used for development
|
|
129
|
+
|
|
130
|
+
* TBConsensusAligner
|
|
131
|
+
* test_data
|
|
132
|
+
- snp_alignment.fasta
|
|
133
|
+
- test_G77777.consensus.fasta
|
|
134
|
+
- test_G77777.vcf.gz
|
|
135
|
+
- test_G88888_k1.consensus.fasta
|
|
136
|
+
- test_G88888_k1.vcf.gz
|
|
137
|
+
- test_G99999.k2.consensus.fasta
|
|
138
|
+
- test_G99999.k2.vcf.gz
|
|
139
|
+
- test_Galaxy_multiple_depths_header.tabular
|
|
140
|
+
- test_reference_200bp.reference.fasta
|
|
141
|
+
- test_regions_blindspots_modlin_farhat_and_PE_PPE_PGRS.bed
|
|
142
|
+
- test.outgroup.all.pos.vcf.gz
|
|
143
|
+
- consensus_galaxy.py
|
|
144
|
+
- main_galaxy.py
|
|
145
|
+
- README.md
|
|
146
|
+
- snp_aligner_galaxy.py
|
|
147
|
+
- TBConsensusAligner.xml
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# TBConsensusAligner
|
|
2
|
+
TBConsensusAligner is a tool designed to create a multi-sequence alignment via consensus
|
|
3
|
+
genomes created from VCF files.
|
|
4
|
+
It can currently be used in different modes as follows
|
|
5
|
+
- VCF to SNP alignment `-s all`
|
|
6
|
+
- input = VCFs, output = SNP alignment
|
|
7
|
+
- runs `consensus_galaxy.py` and `snp_aligner_galaxy.py`
|
|
8
|
+
|
|
9
|
+
- VCF to consensus FASTA files `-s consensus`
|
|
10
|
+
- input = VCFs, output = consensus FASTA files
|
|
11
|
+
- runs only `consensus_galaxy.py`
|
|
12
|
+
|
|
13
|
+
If the script produces a SNP alignment from VCFs, the user can choose whether to output just the SNP alignment
|
|
14
|
+
or the used consensus FASTA files as well with the `-m` option.
|
|
15
|
+
|
|
16
|
+
When running the `consensus_galaxy.py` the user has to provide the VCF files, the reference genome used in their creation and the multisample
|
|
17
|
+
depth file from samtools depth. Optionally, the user can provide one or several BED files to mask certain regions of the genome.
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
The script is run via the command line.
|
|
21
|
+
|
|
22
|
+
***usage***:
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
Usage: TBConsensusAligner [options]
|
|
26
|
+
|
|
27
|
+
Options: -s STR ['consensus' or 'all'] Create either consensus files or consensus files and a variable alignment
|
|
28
|
+
|
|
29
|
+
-m STR ['everything' or 'alignment_only'] If -s all is chosen, output either consensus files and the alignment or just the alignment
|
|
30
|
+
|
|
31
|
+
-v list[STR] Path to the input VCF files
|
|
32
|
+
|
|
33
|
+
-r STR Path to the reference genome
|
|
34
|
+
|
|
35
|
+
-d STR Path to the depth file created by samtools depth
|
|
36
|
+
|
|
37
|
+
-c STR Path to the outgroup VCF for the alignment
|
|
38
|
+
|
|
39
|
+
-b list[STR] Path to the BED files to mask certain genomic regions
|
|
40
|
+
|
|
41
|
+
-o STR Path to the output directory
|
|
42
|
+
|
|
43
|
+
-g FLOAT Percentage of undefined states allowed per polymorphic position in the alignment default at 90 percent value=0.9
|
|
44
|
+
|
|
45
|
+
-t BOOLEAN Test Mode to run the script with smaller genomes
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Example usages command line
|
|
50
|
+
|
|
51
|
+
### From 3 VCFs to alignment, output consensus FASTAs and SNP alignment, masked by two bedfiles
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
python yourfolder/main_galaxy.py \
|
|
55
|
+
-s all \
|
|
56
|
+
-m everything \
|
|
57
|
+
-v path/to/vcf1/vcf1.vcf -v path/to/vcf2/vcf2.vcf -v path/to/vcf3/vcf3.vcf \
|
|
58
|
+
-r path/to/reference/reference_genome.reference.fasta \
|
|
59
|
+
-d path/to/depthfile/depthfile.tabular \
|
|
60
|
+
-c path/to/outgroupvcf/outgroup.vcf \
|
|
61
|
+
-b path/to/bedfile1/bed1.bed -b path/to/bedfile2/bed2.bed \
|
|
62
|
+
-o output \
|
|
63
|
+
-g 0.9
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Logic consensus_galaxy.py
|
|
68
|
+
|
|
69
|
+
This script produces consensus FASTA files from VCFs. The user provides the VCFs, the reference genome,
|
|
70
|
+
the multisample depth file obtained from `samtools depth` and optionally BED files to mask certain genomic regions.
|
|
71
|
+
|
|
72
|
+
***algorithm***
|
|
73
|
+
|
|
74
|
+
We loop through each position in the reference genome and build the consensus sequence base per base with the following rules:
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
- SNPs and MNPs with a frequency >90% are encoded with the alternative base from the VCF.
|
|
78
|
+
- SNPs and MNPs with a frequency between 10% and 90% are encoded with the ambiguity base.
|
|
79
|
+
- SNPs and MNPs with a frequency <10% are encoded with the ancestral base from the reference genome.
|
|
80
|
+
|
|
81
|
+
- Large deletions (coverage 0) are encoded by dashes (-).
|
|
82
|
+
- Small deletions with frequency >90% are encoded by dashes (-).
|
|
83
|
+
- Small deletions with frequency between 10% and 90% are encoded with a 'N'.
|
|
84
|
+
- Small deletions with frequency <10% are encoded with the ancestral base from the reference genome.
|
|
85
|
+
|
|
86
|
+
- Small insertions (alternative bases longer than reference base) are encoded with the ancestral state from the reference genome.
|
|
87
|
+
|
|
88
|
+
- Sites to exclude specified in the bed files are encoded with a 'N'.
|
|
89
|
+
- Variants that have a phred score < 20 are encoded with a 'N'.
|
|
90
|
+
- Sites that are not in the VCF and are covered by less than 5 reads (via depth file) are encoded with a 'N'.
|
|
91
|
+
|
|
92
|
+
- If more than one ALT allele is present, we consider the one with the highest allele frequency.
|
|
93
|
+
```
|
|
94
|
+
Since the VCFs this script is tailored to do not have the allele frequency `AF` calculated, we calculate it using
|
|
95
|
+
`RO = REF allele occurrence` and `AO = ALT allele occurrence` with
|
|
96
|
+
`AF = AO / ( sum(all AO's) + RO )`.
|
|
97
|
+
|
|
98
|
+
## Logic snp_aligner_galaxy.py
|
|
99
|
+
|
|
100
|
+
This script produces a SNP alignment i.e. multi-sequence alignment of polymorphic positions in FASTA format from consensus FASTA files.
|
|
101
|
+
The consensus FASTAs are generated in the previous step by `consensus_galaxy.py` wrapped in `TBConsensusAligner`.
|
|
102
|
+
|
|
103
|
+
***algorithm***
|
|
104
|
+
|
|
105
|
+
The script creates a multi-sequence alignment from the consensus FASTAs in the form of a 2D array where each row represents a sequence
|
|
106
|
+
and each column a genomic position. Each column is checked for the presence of a polymorphism. If such a polymorphism is detected,
|
|
107
|
+
the column is appended to the alignment of polymorphic positions. For each polymorphic position, the corresponding nucleotide from
|
|
108
|
+
the outgroup VCF is retrieved to get the outgroup's sequence.
|
|
109
|
+
|
|
110
|
+
The algorithm to populate the multi-sequence alignment of polymorphic positions is as follows.
|
|
111
|
+
The user can control the proportion of gaps or undefined states (`-` or `N`) for a polymorphic position
|
|
112
|
+
to be kept in the alignment (argument `-g`). By default, this is set to `0.9`, meaning that if a polymorphic position has
|
|
113
|
+
more than 90% gaps or undefined states, it will not be in the final alignment.
|
|
114
|
+
|
|
115
|
+
## Directory structure used for development
|
|
116
|
+
|
|
117
|
+
* TBConsensusAligner
|
|
118
|
+
* test_data
|
|
119
|
+
- snp_alignment.fasta
|
|
120
|
+
- test_G77777.consensus.fasta
|
|
121
|
+
- test_G77777.vcf.gz
|
|
122
|
+
- test_G88888_k1.consensus.fasta
|
|
123
|
+
- test_G88888_k1.vcf.gz
|
|
124
|
+
- test_G99999.k2.consensus.fasta
|
|
125
|
+
- test_G99999.k2.vcf.gz
|
|
126
|
+
- test_Galaxy_multiple_depths_header.tabular
|
|
127
|
+
- test_reference_200bp.reference.fasta
|
|
128
|
+
- test_regions_blindspots_modlin_farhat_and_PE_PPE_PGRS.bed
|
|
129
|
+
- test.outgroup.all.pos.vcf.gz
|
|
130
|
+
- consensus_galaxy.py
|
|
131
|
+
- main_galaxy.py
|
|
132
|
+
- README.md
|
|
133
|
+
- snp_aligner_galaxy.py
|
|
134
|
+
- TBConsensusAligner.xml
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system] # Define the tools needed to build the project
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"] # Use Version 61.0.0 or newer
|
|
3
|
+
build-backend = "setuptools.build_meta" # Use setuptools to process metadata configuration
|
|
4
|
+
|
|
5
|
+
[project] # Human readable description
|
|
6
|
+
name = "tb_consensus_aligner" # Official name of the package
|
|
7
|
+
version = "1.0.0" # Current version
|
|
8
|
+
description = "Building consensus fasta files and variable multi-sequence alignments for mycobacterial genomes."
|
|
9
|
+
readme = "README.md" # Use the README.md file as the official documentation that is being opened
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "scg40", email = "gian.schuepbach@swisstph.ch"}
|
|
13
|
+
]
|
|
14
|
+
classifiers = [ # Key words used to categorize the tool
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics"
|
|
19
|
+
]
|
|
20
|
+
dependencies = [ # Automatically checks if Biopython is installed; if not, installs it
|
|
21
|
+
"biopython>=1.80"
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.scripts] # Create command line shortcut
|
|
25
|
+
tb-consensus-aligner = "tb_consensus_aligner.main_galaxy:main"
|
|
26
|
+
|
|
27
|
+
[tool.setuptools] # Dictate to setuptools which folder contains the scripts
|
|
28
|
+
packages = ["tb_consensus_aligner"]
|
|
File without changes
|