dnaapler 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. dnaapler-0.1.0/HISTORY.md +20 -0
  2. dnaapler-0.1.0/LICENSE +22 -0
  3. dnaapler-0.1.0/PKG-INFO +144 -0
  4. dnaapler-0.1.0/README.md +118 -0
  5. dnaapler-0.1.0/pyproject.toml +45 -0
  6. dnaapler-0.1.0/src/dnaapler/__init__.py +415 -0
  7. dnaapler-0.1.0/src/dnaapler/db/dnaA.faa +6199 -0
  8. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.pdb +0 -0
  9. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.phr +0 -0
  10. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.pin +0 -0
  11. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.pjs +22 -0
  12. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.psq +0 -0
  13. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.ptf +0 -0
  14. dnaapler-0.1.0/src/dnaapler/db/dnaA_db.pto +0 -0
  15. dnaapler-0.1.0/src/dnaapler/db/repA.faa +1298 -0
  16. dnaapler-0.1.0/src/dnaapler/db/repA_db.pdb +0 -0
  17. dnaapler-0.1.0/src/dnaapler/db/repA_db.phr +0 -0
  18. dnaapler-0.1.0/src/dnaapler/db/repA_db.pin +0 -0
  19. dnaapler-0.1.0/src/dnaapler/db/repA_db.pjs +22 -0
  20. dnaapler-0.1.0/src/dnaapler/db/repA_db.psq +0 -0
  21. dnaapler-0.1.0/src/dnaapler/db/repA_db.ptf +0 -0
  22. dnaapler-0.1.0/src/dnaapler/db/repA_db.pto +0 -0
  23. dnaapler-0.1.0/src/dnaapler/db/terL.faa +75053 -0
  24. dnaapler-0.1.0/src/dnaapler/db/terL_db.pdb +0 -0
  25. dnaapler-0.1.0/src/dnaapler/db/terL_db.phr +0 -0
  26. dnaapler-0.1.0/src/dnaapler/db/terL_db.pin +0 -0
  27. dnaapler-0.1.0/src/dnaapler/db/terL_db.pjs +22 -0
  28. dnaapler-0.1.0/src/dnaapler/db/terL_db.psq +0 -0
  29. dnaapler-0.1.0/src/dnaapler/db/terL_db.ptf +0 -0
  30. dnaapler-0.1.0/src/dnaapler/db/terL_db.pto +0 -0
  31. dnaapler-0.1.0/src/dnaapler/tests/helpers/logs/python_bf95c699e4089695c4540a121d356e28ed2abda25716ac4730d2a13e753a71d3.err +2 -0
  32. dnaapler-0.1.0/src/dnaapler/tests/helpers/logs/python_bf95c699e4089695c4540a121d356e28ed2abda25716ac4730d2a13e753a71d3.out +0 -0
  33. dnaapler-0.1.0/src/dnaapler/utils/CITATION +4 -0
  34. dnaapler-0.1.0/src/dnaapler/utils/VERSION +1 -0
  35. dnaapler-0.1.0/src/dnaapler/utils/constants.py +6 -0
  36. dnaapler-0.1.0/src/dnaapler/utils/external_tools.py +94 -0
  37. dnaapler-0.1.0/src/dnaapler/utils/processing.py +176 -0
  38. dnaapler-0.1.0/src/dnaapler/utils/util.py +103 -0
  39. dnaapler-0.1.0/src/dnaapler/utils/validation.py +102 -0
@@ -0,0 +1,20 @@
1
+ =======
2
+ History
3
+ =======
4
+
5
+ 0.1.0 (2022-10-12)
6
+ ------------------
7
+
8
+ * Completely overhauled
9
+ * First stable release with pypi and conda
10
+ * `dnaapler chromosome` added
11
+ * `dnaapler custom` added
12
+ * `dnaapler mystery` added
13
+ * `dnaapler phage` added
14
+ * `dnaapler plasmid` added
15
+
16
+
17
+ 0.0.1 (2022-10-12)
18
+ ------------------
19
+
20
+ * First release (conda only `conda install -c gbouras dnaapler`)
dnaapler-0.1.0/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021, George Bouras
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.1
2
+ Name: dnaapler
3
+ Version: 0.1.0
4
+ Summary: Reorients assembled microbial sequences
5
+ Home-page: https://github.com/gbouras13/dnaapler
6
+ License: MIT
7
+ Keywords: microbial,bioinformatics
8
+ Author: George Bouras
9
+ Author-email: george.bouras@adelaide.edu.au
10
+ Requires-Python: >=3.8,<4.0
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Requires-Dist: biopython (>=1.76)
18
+ Requires-Dist: click (>=8.0.0)
19
+ Requires-Dist: loguru (>=0.5.3)
20
+ Requires-Dist: pandas (>=1.4.2)
21
+ Requires-Dist: pyrodigal (>=2.0.0)
22
+ Requires-Dist: pyyaml (>=6.0)
23
+ Project-URL: Repository, https://github.com/gbouras13/dnaapler
24
+ Description-Content-Type: text/markdown
25
+
26
+ [![CI](https://github.com/gbouras13/dnaapler/actions/workflows/ci.yaml/badge.svg)](https://github.com/gbouras13/dnaapler/actions/workflows/ci.yaml)
27
+ [![codecov](https://codecov.io/gh/gbouras13/dnaapler/branch/refactor/graph/badge.svg?token=4B1T2PGM9V)](https://codecov.io/gh/gbouras13/dnaapler)
28
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
29
+
30
+
31
+ # dnaapler
32
+
33
+ Description
34
+ ----------
35
+
36
+ `dnaapler` is a simple python program that takes a single nucleotide input sequence (in FASTA format), finds the desired start gene using `blastx` against an amino acid database, checks that the start of a gene is found, and if so, then reorients the chromosome to begin with this gene on the forward strand.
37
+
38
+ It was designed to replicate the reorientation functionality of [Unicycler](https://github.com/rrwick/Unicycler/blob/main/unicycler/gene_data/repA.fasta) with dnaA, but for FASTA input and for long-read first assembled chromosomes. I have extended it to work with plasmids and phages, or for any input FASTA desired with `dnaapler custom` or `dnaapler mystery`.
39
+
40
+ For bacterial chromosomes, `dnaapler chromosome` should ensure the chromosome breakpoint never interrupts genes or mobile genetic elements like prophages. It is intended to be used with good-quality completed bacterial genomes, generated with methods such as [Trycycler](https://github.com/rrwick/Trycycler/wiki), [Dragonflye](https://github.com/rpetit3/dragonflye) or my own pipeline [hybracter](https://github.com/gbouras13/hybracter).
41
+
42
+ ## Installation
43
+
44
+ dnaapler requires only BLAST as an external dependency.
45
+
46
+ Installation from conda is recommended as this will install BLAST automatically when it becomes available.
47
+
48
+ ### Conda
49
+
50
+ ```
51
+ conda install -c bioconda dnaapler
52
+ ```
53
+
54
+ ### Pip
55
+
56
+ ```
57
+ pip install dnaapler
58
+ ```
59
+
60
+ You will need to install BLAST separately.
61
+
62
+ e.g.
63
+ `conda install -c bioconda blast`
64
+
65
+
66
+
67
+ Usage
68
+ ----------
69
+
70
+ ```
71
+ Usage: dnaapler [OPTIONS] COMMAND [ARGS]...
72
+
73
+ Options:
74
+ -h, --help Show this message and exit.
75
+ -V, --version Show the version and exit.
76
+
77
+ Commands:
78
+ chromosome Reorients your sequence to begin with the dnaA chromosomal...
79
+ citation Print the citation(s) for this tool
80
+ custom Reorients your sequence with a custom database
81
+ mystery Reorients your sequence with a random gene
82
+ phage Reorients your sequence to begin with the terL large...
83
+ plasmid Reorients your sequence to begin with the repA replication...
84
+ ```
85
+
86
+ ```
87
+ Usage: dnaapler chromosome [OPTIONS]
88
+
89
+ Reorients your sequence to begin with the dnaA chromosomal replication
90
+ initiation gene
91
+
92
+ Options:
93
+ -h, --help Show this message and exit.
94
+ -V, --version Show the version and exit.
95
+ -i, --input PATH Path to input file in FASTA format [required]
96
+ -o, --output PATH Output directory [default: output.dnaapler]
97
+ -t, --threads INTEGER Number of threads to use with BLAST. [default: 1]
98
+ -p, --prefix TEXT Prefix for output files. [default :dnaapler]
99
+ -f, --force Force overwrites the output directory
100
+ ```
101
+
102
+
103
+
104
+ Databases
105
+ =============
106
+
107
+ `dnaapler chromosome` uses 733 proteins downloaded from Swissprot with the query "Chromosomal replication initiator protein DnaA" on 24 May 2023 as its database for dnaA.
108
+
109
+ `dnaapler plasmid` uses the repA database curated by Ryan Wick in [Unicycler](https://github.com/rrwick/Unicycler/blob/main/unicycler/gene_data/repA.fasta).
110
+
111
+ `dnaapler phage` uses a terL database curated using [PHROGs](https://phrogs.lmge.uca.fr). I downloaded all the AA sequences of the 55 phrogs annotated as 'large terminase subunit', combined them and deduplicated them using [seqkit](https://github.com/shenwei356/seqkit) `seqkit rmdup -s -o terL.faa phrog_terL.faa`.
112
+
113
+ `dnaapler custom` uses a custom amino acid FASTA format gene(s) that you specify using `-c`.
114
+
115
+ The matching is strict - it requires a strong BLAST match (e-value 1E-10), and the first amino acid of a BLAST hit gene to be identified as Methionine, Valine or Leucine, the 3 most used start codons in bacteria/phages.
116
+
117
+ For the most commonly studied microbes (ESKAPE pathogens, etc), the dnaA database should suffice.
118
+
119
+ If you try dnaapler on a more novel or under-studied microbe with a dnaA gene that has little sequence similarity to the database, you may need to provide your own dnaA gene(s) in amino acid FASTA format using `dnaapler custom`.
120
+
121
+ After this [issue](https://github.com/gbouras13/dnaapler/issues/1), `dnaapler mystery` was added. It predicts all ORFs in the input, then picks a random ORF to re-orient your sequence with.
122
+
123
+
124
+ Motivation
125
+ ------------
126
+
127
+ 1. I couldn't get [Circlator](https://sanger-pathogens.github.io/circlator/) to work and it is no longer supported.
128
+ 2. [berokka](https://github.com/tseemann/berokka) doesn't orient chromosomes to begin with dnaA.
129
+ 3. After reading Ryan Wick's masterful bacterial genome assembly [tutorial](https://github.com/rrwick/Perfect-bacterial-genome-tutorial/wiki), I realised that it is probably optimal to run 2 polishing steps, once before then once after rotating the chromosome, to ensure the breakpoint is polished. Further, for some "complete" assemblies that didn't circularise properly, I figured that as long as you have a complete assembly (even if not marked as "circular" in Flye), polishing after a re-orientation would be likely to circularise the chromosome. A bit like Ryan's [rotate_circular_gfa.py](https://github.com/rrwick/Perfect-bacterial-genome-tutorial/blob/main/scripts/rotate_circular_gfa.py) script, without the requirement of strict circularity.
130
+ 4. While researching MGEs in _S. aureus_ whole genome sequences, I repeatedly found instances where MGEs were interrupted by the chromosome breakpoint. So I thought I'd add a tool to automate it in my pipeline.
131
+ 5. It's probably good to have all your sequences start at the same location for synteny analyses.
132
+
133
+ Polishing Afterwards
134
+ -----------
135
+
136
+ I recommend that you undertake 2 rounds of polishing. The first prior to running dnaapler, and then again after. I'd highly recommend a conservative polisher like [Polypolish](https://github.com/rrwick/Polypolish) if you have short reads, otherwise 2 rounds of medaka.
137
+
138
+ Acknowledgements
139
+ =============
140
+
141
+ Thanks to Torsten Seemann, Ryan Wick and the Circlator team for their existing work in the space. Also to [Michael Hall](https://github.com/mbhall88), whose repository [tbpore](https://github.com/mbhall88/tbpore) I took and adapted a lot of scaffolding code from because he writes really nice code, [Rob Edwards](https://github.com/linsalrob), because everything always comes back to phages, and especially [Vijini Mallawaarachchi](https://github.com/Vini2) who taught me how to actually do something resembling legitimate software development.
142
+
143
+
144
+
@@ -0,0 +1,118 @@
1
+ [![CI](https://github.com/gbouras13/dnaapler/actions/workflows/ci.yaml/badge.svg)](https://github.com/gbouras13/dnaapler/actions/workflows/ci.yaml)
2
+ [![codecov](https://codecov.io/gh/gbouras13/dnaapler/branch/refactor/graph/badge.svg?token=4B1T2PGM9V)](https://codecov.io/gh/gbouras13/dnaapler)
3
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
4
+
5
+
6
+ # dnaapler
7
+
8
+ Description
9
+ ----------
10
+
11
+ `dnaapler` is a simple python program that takes a single nucleotide input sequence (in FASTA format), finds the desired start gene using `blastx` against an amino acid database, checks that the start of a gene is found, and if so, then reorients the chromosome to begin with this gene on the forward strand.
12
+
13
+ It was designed to replicate the reorientation functionality of [Unicycler](https://github.com/rrwick/Unicycler/blob/main/unicycler/gene_data/repA.fasta) with dnaA, but for FASTA input and for long-read first assembled chromosomes. I have extended it to work with plasmids and phages, or for any input FASTA desired with `dnaapler custom` or `dnaapler mystery`.
14
+
15
+ For bacterial chromosomes, `dnaapler chromosome` should ensure the chromosome breakpoint never interrupts genes or mobile genetic elements like prophages. It is intended to be used with good-quality completed bacterial genomes, generated with methods such as [Trycycler](https://github.com/rrwick/Trycycler/wiki), [Dragonflye](https://github.com/rpetit3/dragonflye) or my own pipeline [hybracter](https://github.com/gbouras13/hybracter).
16
+
17
+ ## Installation
18
+
19
+ dnaapler requires only BLAST as an external dependency.
20
+
21
+ Installation from conda is recommended as this will install BLAST automatically when it becomes available.
22
+
23
+ ### Conda
24
+
25
+ ```
26
+ conda install -c bioconda dnaapler
27
+ ```
28
+
29
+ ### Pip
30
+
31
+ ```
32
+ pip install dnaapler
33
+ ```
34
+
35
+ You will need to install BLAST separately.
36
+
37
+ e.g.
38
+ `conda install -c bioconda blast`
39
+
40
+
41
+
42
+ Usage
43
+ ----------
44
+
45
+ ```
46
+ Usage: dnaapler [OPTIONS] COMMAND [ARGS]...
47
+
48
+ Options:
49
+ -h, --help Show this message and exit.
50
+ -V, --version Show the version and exit.
51
+
52
+ Commands:
53
+ chromosome Reorients your sequence to begin with the dnaA chromosomal...
54
+ citation Print the citation(s) for this tool
55
+ custom Reorients your sequence with a custom database
56
+ mystery Reorients your sequence with a random gene
57
+ phage Reorients your sequence to begin with the terL large...
58
+ plasmid Reorients your sequence to begin with the repA replication...
59
+ ```
60
+
61
+ ```
62
+ Usage: dnaapler chromosome [OPTIONS]
63
+
64
+ Reorients your sequence to begin with the dnaA chromosomal replication
65
+ initiation gene
66
+
67
+ Options:
68
+ -h, --help Show this message and exit.
69
+ -V, --version Show the version and exit.
70
+ -i, --input PATH Path to input file in FASTA format [required]
71
+ -o, --output PATH Output directory [default: output.dnaapler]
72
+ -t, --threads INTEGER Number of threads to use with BLAST. [default: 1]
73
+ -p, --prefix TEXT Prefix for output files. [default :dnaapler]
74
+ -f, --force Force overwrites the output directory
75
+ ```
76
+
77
+
78
+
79
+ Databases
80
+ =============
81
+
82
+ `dnaapler chromosome` uses 733 proteins downloaded from Swissprot with the query "Chromosomal replication initiator protein DnaA" on 24 May 2023 as its database for dnaA.
83
+
84
+ `dnaapler plasmid` uses the repA database curated by Ryan Wick in [Unicycler](https://github.com/rrwick/Unicycler/blob/main/unicycler/gene_data/repA.fasta).
85
+
86
+ `dnaapler phage` uses a terL database curated using [PHROGs](https://phrogs.lmge.uca.fr). I downloaded all the AA sequences of the 55 phrogs annotated as 'large terminase subunit', combined them and deduplicated them using [seqkit](https://github.com/shenwei356/seqkit) `seqkit rmdup -s -o terL.faa phrog_terL.faa`.
87
+
88
+ `dnaapler custom` uses a custom amino acid FASTA format gene(s) that you specify using `-c`.
89
+
90
+ The matching is strict - it requires a strong BLAST match (e-value 1E-10), and the first amino acid of a BLAST hit gene to be identified as Methionine, Valine or Leucine, the 3 most used start codons in bacteria/phages.
91
+
92
+ For the most commonly studied microbes (ESKAPE pathogens, etc), the dnaA database should suffice.
93
+
94
+ If you try dnaapler on a more novel or under-studied microbe with a dnaA gene that has little sequence similarity to the database, you may need to provide your own dnaA gene(s) in amino acid FASTA format using `dnaapler custom`.
95
+
96
+ After this [issue](https://github.com/gbouras13/dnaapler/issues/1), `dnaapler mystery` was added. It predicts all ORFs in the input, then picks a random ORF to re-orient your sequence with.
97
+
98
+
99
+ Motivation
100
+ ------------
101
+
102
+ 1. I couldn't get [Circlator](https://sanger-pathogens.github.io/circlator/) to work and it is no longer supported.
103
+ 2. [berokka](https://github.com/tseemann/berokka) doesn't orient chromosomes to begin with dnaA.
104
+ 3. After reading Ryan Wick's masterful bacterial genome assembly [tutorial](https://github.com/rrwick/Perfect-bacterial-genome-tutorial/wiki), I realised that it is probably optimal to run 2 polishing steps, once before then once after rotating the chromosome, to ensure the breakpoint is polished. Further, for some "complete" assemblies that didn't circularise properly, I figured that as long as you have a complete assembly (even if not marked as "circular" in Flye), polishing after a re-orientation would be likely to circularise the chromosome. A bit like Ryan's [rotate_circular_gfa.py](https://github.com/rrwick/Perfect-bacterial-genome-tutorial/blob/main/scripts/rotate_circular_gfa.py) script, without the requirement of strict circularity.
105
+ 4. While researching MGEs in _S. aureus_ whole genome sequences, I repeatedly found instances where MGEs were interrupted by the chromosome breakpoint. So I thought I'd add a tool to automate it in my pipeline.
106
+ 5. It's probably good to have all your sequences start at the same location for synteny analyses.
107
+
108
+ Polishing Afterwards
109
+ -----------
110
+
111
+ I recommend that you undertake 2 rounds of polishing. The first prior to running dnaapler, and then again after. I'd highly recommend a conservative polisher like [Polypolish](https://github.com/rrwick/Polypolish) if you have short reads, otherwise 2 rounds of medaka.
112
+
113
+ Acknowledgements
114
+ =============
115
+
116
+ Thanks to Torsten Seemann, Ryan Wick and the Circlator team for their existing work in the space. Also to [Michael Hall](https://github.com/mbhall88), whose repository [tbpore](https://github.com/mbhall88/tbpore) I took and adapted a lot of scaffolding code from because he writes really nice code, [Rob Edwards](https://github.com/linsalrob), because everything always comes back to phages, and especially [Vijini Mallawaarachchi](https://github.com/Vini2) who taught me how to actually do something resembling legitimate software development.
117
+
118
+
@@ -0,0 +1,45 @@
1
+ [tool.poetry]
2
+ name = "dnaapler"
3
+ version = "0.1.0" # change VERSION too
4
+ description = "Reorients assembled microbial sequences"
5
+ authors = ["George Bouras <george.bouras@adelaide.edu.au>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/gbouras13/dnaapler"
9
+ repository = "https://github.com/gbouras13/dnaapler"
10
+ keywords = ["microbial", "bioinformatics"]
11
+ include = [
12
+ "HISTORY.md"
13
+ ]
14
+
15
+ [tool.poetry.scripts]
16
+ dnaapler = 'dnaapler:main'
17
+
18
+ [tool.poetry.dependencies]
19
+ python = ">=3.8,<4.0"
20
+ click = ">=8.0.0"
21
+ loguru = ">=0.5.3"
22
+ pyyaml = ">=6.0"
23
+ pandas = ">=1.4.2"
24
+ biopython = ">=1.76"
25
+ pyrodigal = ">=2.0.0"
26
+
27
+
28
+ [tool.poetry.dev-dependencies]
29
+ black = ">=22.3.0"
30
+ isort = ">=5.10.1"
31
+ pytest = ">=6.2.5"
32
+ pytest-cov = ">=3.0.0"
33
+ flake8 = ">=3.0.1"
34
+
35
+ [[tool.poetry.source]]
36
+ name = "pypi-test"
37
+ url = "https://test.pypi.org/simple/"
38
+ priority = "primary"
39
+
40
+ [build-system]
41
+ requires = ["poetry-core>=1.0.0"]
42
+ build-backend = "poetry.core.masonry.api"
43
+
44
+ [tool.isort]
45
+ profile = "black"