plastburstalign 0.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plastburstalign-0.9.1/LICENSE +29 -0
- plastburstalign-0.9.1/PKG-INFO +122 -0
- plastburstalign-0.9.1/README.md +98 -0
- plastburstalign-0.9.1/plastburstalign/__init__.py +20 -0
- plastburstalign-0.9.1/plastburstalign/__main__.py +10 -0
- plastburstalign-0.9.1/plastburstalign/alignment_ops.py +644 -0
- plastburstalign-0.9.1/plastburstalign/extraction_ops.py +574 -0
- plastburstalign-0.9.1/plastburstalign/helpers.py +30 -0
- plastburstalign-0.9.1/plastburstalign/logging_ops.py +76 -0
- plastburstalign-0.9.1/plastburstalign/plastome_burst_and_align.py +38 -0
- plastburstalign-0.9.1/plastburstalign/seqfeature_ops.py +484 -0
- plastburstalign-0.9.1/plastburstalign/user_parameters.py +283 -0
- plastburstalign-0.9.1/plastburstalign.egg-info/PKG-INFO +122 -0
- plastburstalign-0.9.1/plastburstalign.egg-info/SOURCES.txt +17 -0
- plastburstalign-0.9.1/plastburstalign.egg-info/dependency_links.txt +1 -0
- plastburstalign-0.9.1/plastburstalign.egg-info/requires.txt +3 -0
- plastburstalign-0.9.1/plastburstalign.egg-info/top_level.txt +1 -0
- plastburstalign-0.9.1/pyproject.toml +52 -0
- plastburstalign-0.9.1/setup.cfg +4 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022, Michael Gruenstaeudl, PhD
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
17
|
+
contributors may be used to endorse or promote products derived from
|
|
18
|
+
this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: plastburstalign
|
|
3
|
+
Version: 0.9.1
|
|
4
|
+
Summary: A Python tool to extract and align genes, introns, and intergenic spacers across hundreds of plastid genomes using associative arrays
|
|
5
|
+
Author-email: "Michael Gruenstaeudl, PhD" <m_gruenstaeudl@fhsu.edu>, Gregory Smith <g_smith10@mail.fhsu.edu>
|
|
6
|
+
Project-URL: Homepage, https://gruenstaeudl-lab.com/
|
|
7
|
+
Project-URL: Repository, https://github.com/michaelgruenstaeudl/PlastomeBurstAndAlign
|
|
8
|
+
Keywords: bioinformatics,alignment,flatfile,MAFFT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: biopython>=1.78
|
|
22
|
+
Requires-Dist: coloredlogs>=14.0
|
|
23
|
+
Requires-Dist: requests
|
|
24
|
+
|
|
25
|
+
# plastburstalign
|
|
26
|
+
A Python tool to extract and align genes, introns, and intergenic spacers across hundreds of plastid genomes using associative arrays
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
### Installation on Linux (Debian)
|
|
30
|
+
```bash
|
|
31
|
+
# Alignment software
|
|
32
|
+
apt install mafft
|
|
33
|
+
|
|
34
|
+
# Other dependencies
|
|
35
|
+
apt install python3-biopython
|
|
36
|
+
apt install python3-coloredlogs
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Overview of process
|
|
40
|
+

|
|
41
|
+
|
|
42
|
+
### Main features
|
|
43
|
+
- Extracts all sequences from each of three different genome marker types (i.e., genes, introns, or intergenic spacers) from a set of plastid genomes in GenBank flatfile format, groups and aligns homologous extracted sequences, and then saves the alignments to file
|
|
44
|
+
- Saves both the individual alignments and the concatenation of all alignments
|
|
45
|
+
- Automatic removal of any duplicate regions (i.e., relevant for features duplicated through the IRs)
|
|
46
|
+
- Exon splicing operations #1: Automatic merging of all exons of any cis-spliced gene [see functions of class `ExonSpliceHandler`]
|
|
47
|
+
- Exon splicing operations #2: Automatic grouping of all exons of any trans-spliced gene (e.g., _rps12_), followed by merging the exons [see functions of class `ExonSpliceHandler`]
|
|
48
|
+
- Automatic removal of regions that do not fulfill
|
|
49
|
+
- a minimum, user-specified sequence length
|
|
50
|
+
- a minimum, user-specified number of taxa in the dataset that the region must be found in [see function `DataCleaning()` for both]
|
|
51
|
+
|
|
52
|
+
### Additional features
|
|
53
|
+
- Rapid sequence extraction and alignment of the genes/introns/intergenic spacers due to process parallelization using multiple CPUs [see internal function `_nuc_MSA()`]
|
|
54
|
+
- Automatic removal of any user-specified genes/introns/intergenic spacers
|
|
55
|
+
- Choice of
|
|
56
|
+
- the order of concatenation of the aligned genes/introns/intergenic spacers to either the natural order of the first input genome (commandline option `seq`) or an alphabetic order (commandline option `alpha`)
|
|
57
|
+
- automatic case standardization of gene names to adjust for letter-case differences between gene annotations of different genome records (which is especially relevant for anticodon and amino acid abbreviations of tRNAs); includes the option to remove anticodon and amino acid abbreviations from tRNA gene names altogether [see function `clean_gene()`]
|
|
58
|
+
- If a gene/intron/intergenic spacer cannot be extracted from a GenBank record, provision of explanation why the extraction failed
|
|
59
|
+
- Availability of two log levels:
|
|
60
|
+
- default (suitable for regular software execution), and
|
|
61
|
+
- verbose (suitable for debugging)
|
|
62
|
+
- Package works out of the box on Unix-like systems due to inclusion of the alignment software executable (MAFFT) into the package.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
### Usage
|
|
66
|
+
|
|
67
|
+
#### Option 1: As a script
|
|
68
|
+
If the current working directory is within `plastburstalign`, execute the package via:
|
|
69
|
+
```bash
|
|
70
|
+
python -m plastburstalign
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
#### Option 2: As a module
|
|
74
|
+
From within Python, execute the package functions via:
|
|
75
|
+
```python
|
|
76
|
+
from plastburstalign import PlastomeRegionBurstAndAlign
|
|
77
|
+
burst = PlastomeRegionBurstAndAlign()
|
|
78
|
+
burst.execute()
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
#### Usage of individual package components
|
|
82
|
+
Individual components can be used as well. For example, to use the class `MAFFT` by itself (e.g., instantiate a configuration of MAFFT that will execute with 1 thread; instantiate another that will execute with 10 threads), type:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from plastburstalign import MAFFT
|
|
86
|
+
|
|
87
|
+
mafft_1 = MAFFT()
|
|
88
|
+
mafft_10 = MAFFT({"num_threads": 10})
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
### Explanation of exon splicing
|
|
93
|
+
|
|
94
|
+
As the gene list produced through parsing all input genomes is iterated over, genes that comprise multiple exons are automatically flagged and treated according to the distance between their exons. Cis-spliced genes only comprise exons that are adjacent to each other, trans-spliced genes comprise one or more exons that are not adjacent to each other. This software merges the exons of any cis-spliced gene in place (i.e., according to the location specified by the source GenBank file; no repositioning of the exons necessary). The exons of any trans-spliced gene (e.g., _rps12_), by contrast, undergo a repositioning before being merged. Specifically, the software accommodates the fact that GenBank flatfiles list trans-spliced genes (e.g., _rps12_) out of their natural order along the genome sequence and additionally repositions the exons of trans-spliced genes by converting them to adjacent exons and then merges these exons.
|
|
95
|
+
|
|
96
|
+
For the repositioning of a trans-spliced gene, all annotations of that gene are first moved from the main gene list to a separate list. Then, the annotations are split into simple location features for each contiguous group of exons. Third, the expected location of each of these simple gene features is determined by comparing its end location with the end locations of the gene features in the main gene list: if the expected location has no overlap with either the preceding or the succeeding gene and the feature is different in name from both, it is directly inserted into that location. Alternatively, if the expected location of the feature results in a flanking gene (strictly adjacent or overlapping) with the same name, the annotations are merged; this merging applies to both the preceding and the succeeding gene.
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
### Testing
|
|
100
|
+
```bash
|
|
101
|
+
cd benchmarking
|
|
102
|
+
# CDS
|
|
103
|
+
python test_script_cds.py benchmarking1
|
|
104
|
+
python test_script_cds.py benchmarking2
|
|
105
|
+
# INT
|
|
106
|
+
python test_script_int.py benchmarking1
|
|
107
|
+
python test_script_int.py benchmarking2
|
|
108
|
+
# IGS
|
|
109
|
+
python test_script_igs.py benchmarking1
|
|
110
|
+
python test_script_igs.py benchmarking2
|
|
111
|
+
```
|
|
112
|
+
- Dataset `benchmarking1.tar.gz`: all Asteraceae (n=155) listed in [Yang et al. 2022](https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2022.808156)
|
|
113
|
+
- Dataset `benchmarking2.tar.gz`: all monocots (n=733) listed in [Yang et al. 2022](https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2022.808156)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
### Exemplary usage
|
|
117
|
+
See [this document](https://github.com/michaelgruenstaeudl/PlastomeBurstAndAlign/blob/main/docs/exemplary_usage.md)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
### Generating more test data
|
|
121
|
+
See [this document](https://github.com/michaelgruenstaeudl/PlastomeBurstAndAlign/blob/main/docs/generating_test_data.md)
|
|
122
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# plastburstalign
|
|
2
|
+
A Python tool to extract and align genes, introns, and intergenic spacers across hundreds of plastid genomes using associative arrays
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
### Installation on Linux (Debian)
|
|
6
|
+
```bash
|
|
7
|
+
# Alignment software
|
|
8
|
+
apt install mafft
|
|
9
|
+
|
|
10
|
+
# Other dependencies
|
|
11
|
+
apt install python3-biopython
|
|
12
|
+
apt install python3-coloredlogs
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Overview of process
|
|
16
|
+

|
|
17
|
+
|
|
18
|
+
### Main features
|
|
19
|
+
- Extracts all sequences from each of three different genome marker types (i.e., genes, introns, or intergenic spacers) from a set of plastid genomes in GenBank flatfile format, groups and aligns homologous extracted sequences, and then saves the alignments to file
|
|
20
|
+
- Saves both the individual alignments and the concatenation of all alignments
|
|
21
|
+
- Automatic removal of any duplicate regions (i.e., relevant for features duplicated through the IRs)
|
|
22
|
+
- Exon splicing operations #1: Automatic merging of all exons of any cis-spliced gene [see functions of class `ExonSpliceHandler`]
|
|
23
|
+
- Exon splicing operations #2: Automatic grouping of all exons of any trans-spliced gene (e.g., _rps12_), followed by merging the exons [see functions of class `ExonSpliceHandler`]
|
|
24
|
+
- Automatic removal of regions that do not fulfill
|
|
25
|
+
- a minimum, user-specified sequence length
|
|
26
|
+
- a minimum, user-specified number of taxa in the dataset that the region must be found in [see function `DataCleaning()` for both]
|
|
27
|
+
|
|
28
|
+
### Additional features
|
|
29
|
+
- Rapid sequence extraction and alignment of the genes/introns/intergenic spacers due to process parallelization using multiple CPUs [see internal function `_nuc_MSA()`]
|
|
30
|
+
- Automatic removal of any user-specified genes/introns/intergenic spacers
|
|
31
|
+
- Choice of
|
|
32
|
+
- the order of concatenation of the aligned genes/introns/intergenic spacers to either the natural order of the first input genome (commandline option `seq`) or an alphabetic order (commandline option `alpha`)
|
|
33
|
+
- automatic case standardization of gene names to adjust for letter-case differences between gene annotations of different genome records (which is especially relevant for anticodon and amino acid abbreviations of tRNAs); includes the option to remove anticodon and amino acid abbreviations from tRNA gene names altogether [see function `clean_gene()`]
|
|
34
|
+
- If a gene/intron/intergenic spacer cannot be extracted from a GenBank record, provision of explanation why the extraction failed
|
|
35
|
+
- Availability of two log levels:
|
|
36
|
+
- default (suitable for regular software execution), and
|
|
37
|
+
- verbose (suitable for debugging)
|
|
38
|
+
- Package works out of the box on Unix-like systems due to inclusion of the alignment software executable (MAFFT) into the package.
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
### Usage
|
|
42
|
+
|
|
43
|
+
#### Option 1: As a script
|
|
44
|
+
If the current working directory is within `plastburstalign`, execute the package via:
|
|
45
|
+
```bash
|
|
46
|
+
python -m plastburstalign
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
#### Option 2: As a module
|
|
50
|
+
From within Python, execute the package functions via:
|
|
51
|
+
```python
|
|
52
|
+
from plastburstalign import PlastomeRegionBurstAndAlign
|
|
53
|
+
burst = PlastomeRegionBurstAndAlign()
|
|
54
|
+
burst.execute()
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
#### Usage of individual package components
|
|
58
|
+
Individual components can be used as well. For example, to use the class `MAFFT` by itself (e.g., instantiate a configuration of MAFFT that will execute with 1 thread; instantiate another that will execute with 10 threads), type:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from plastburstalign import MAFFT
|
|
62
|
+
|
|
63
|
+
mafft_1 = MAFFT()
|
|
64
|
+
mafft_10 = MAFFT({"num_threads": 10})
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
### Explanation of exon splicing
|
|
69
|
+
|
|
70
|
+
As the gene list produced through parsing all input genomes is iterated over, genes that comprise multiple exons are automatically flagged and treated according to the distance between their exons. Cis-spliced genes only comprise exons that are adjacent to each other, trans-spliced genes comprise one or more exons that are not adjacent to each other. This software merges the exons of any cis-spliced gene in place (i.e., according to the location specified by the source GenBank file; no repositioning of the exons necessary). The exons of any trans-spliced gene (e.g., _rps12_), by contrast, undergo a repositioning before being merged. Specifically, the software accommodates the fact that GenBank flatfiles list trans-spliced genes (e.g., _rps12_) out of their natural order along the genome sequence and additionally repositions the exons of trans-spliced genes by converting them to adjacent exons and then merges these exons.
|
|
71
|
+
|
|
72
|
+
For the repositioning of a trans-spliced gene, all annotations of that gene are first moved from the main gene list to a separate list. Then, the annotations are split into simple location features for each contiguous group of exons. Third, the expected location of each of these simple gene features is determined by comparing its end location with the end locations of the gene features in the main gene list: if the expected location has no overlap with either the preceding or the succeeding gene and the feature is different in name from both, it is directly inserted into that location. Alternatively, if the expected location of the feature results in a flanking gene (strictly adjacent or overlapping) with the same name, the annotations are merged; this merging applies to both the preceding and the succeeding gene.
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
### Testing
|
|
76
|
+
```bash
|
|
77
|
+
cd benchmarking
|
|
78
|
+
# CDS
|
|
79
|
+
python test_script_cds.py benchmarking1
|
|
80
|
+
python test_script_cds.py benchmarking2
|
|
81
|
+
# INT
|
|
82
|
+
python test_script_int.py benchmarking1
|
|
83
|
+
python test_script_int.py benchmarking2
|
|
84
|
+
# IGS
|
|
85
|
+
python test_script_igs.py benchmarking1
|
|
86
|
+
python test_script_igs.py benchmarking2
|
|
87
|
+
```
|
|
88
|
+
- Dataset `benchmarking1.tar.gz`: all Asteraceae (n=155) listed in [Yang et al. 2022](https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2022.808156)
|
|
89
|
+
- Dataset `benchmarking2.tar.gz`: all monocots (n=733) listed in [Yang et al. 2022](https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2022.808156)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
### Exemplary usage
|
|
93
|
+
See [this document](https://github.com/michaelgruenstaeudl/PlastomeBurstAndAlign/blob/main/docs/exemplary_usage.md)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
### Generating more test data
|
|
97
|
+
See [this document](https://github.com/michaelgruenstaeudl/PlastomeBurstAndAlign/blob/main/docs/generating_test_data.md)
|
|
98
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from datetime import date
|
|
2
|
+
from importlib.metadata import version
|
|
3
|
+
|
|
4
|
+
__name__ = "plastburstalign"
|
|
5
|
+
__author__ = "Michael Gruenstaeudl, PhD"
|
|
6
|
+
__email__ = "m_gruenstaeudl@fhsu.edu"
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
__version__ = version(__name__)
|
|
10
|
+
except ModuleNotFoundError:
|
|
11
|
+
__version__ = date.today()
|
|
12
|
+
|
|
13
|
+
from .user_parameters import UserParameters
|
|
14
|
+
from .seqfeature_ops import PlastidData
|
|
15
|
+
from .extraction_ops import ExtractAndCollect, DataCleaning
|
|
16
|
+
from .alignment_ops import AlignmentCoordination, MAFFT
|
|
17
|
+
from .plastome_burst_and_align import PlastomeRegionBurstAndAlign
|
|
18
|
+
|
|
19
|
+
__all__ = ['PlastomeRegionBurstAndAlign', 'UserParameters', 'PlastidData',
|
|
20
|
+
'ExtractAndCollect', 'DataCleaning', 'AlignmentCoordination', 'MAFFT']
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from .user_parameters import UserParametersScript
|
|
2
|
+
from .plastome_burst_and_align import PlastomeRegionBurstAndAlign
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
if __name__ == "__main__":
|
|
6
|
+
params = UserParametersScript()
|
|
7
|
+
burst_align = PlastomeRegionBurstAndAlign(params)
|
|
8
|
+
burst_align.execute()
|
|
9
|
+
|
|
10
|
+
print("\nend of script\n")
|