telomore 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telomore-0.4.1/.gitignore +84 -0
- telomore-0.4.1/.pre-commit-config.yaml +40 -0
- telomore-0.4.1/CITATION.cff +47 -0
- telomore-0.4.1/LICENSE +21 -0
- telomore-0.4.1/PKG-INFO +149 -0
- telomore-0.4.1/README.md +124 -0
- telomore-0.4.1/pyproject.toml +139 -0
- telomore-0.4.1/src/telomore/__init__.py +5 -0
- telomore-0.4.1/src/telomore/_version.py +34 -0
- telomore-0.4.1/src/telomore/app.py +536 -0
- telomore-0.4.1/src/telomore/utils/__init__.py +1 -0
- telomore-0.4.1/src/telomore/utils/arg_parser.py +220 -0
- telomore-0.4.1/src/telomore/utils/classes_and_small_func.py +289 -0
- telomore-0.4.1/src/telomore/utils/cmd_tools.py +732 -0
- telomore-0.4.1/src/telomore/utils/fasta_tools.py +595 -0
- telomore-0.4.1/src/telomore/utils/map_tools.py +1333 -0
- telomore-0.4.1/src/telomore/utils/qc_reports.py +493 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Package notes
|
|
2
|
+
.ideas/
|
|
3
|
+
|
|
4
|
+
# Mac stuff
|
|
5
|
+
.DS_Store
|
|
6
|
+
|
|
7
|
+
# Versioning
|
|
8
|
+
src/telomore/_version.py
|
|
9
|
+
|
|
10
|
+
# Byte-compiled / optimized / DLL files
|
|
11
|
+
__pycache__/
|
|
12
|
+
*.py[cod]
|
|
13
|
+
*$py.class
|
|
14
|
+
|
|
15
|
+
# C extensions
|
|
16
|
+
*.so
|
|
17
|
+
|
|
18
|
+
# Distribution / packaging
|
|
19
|
+
.Python
|
|
20
|
+
build/
|
|
21
|
+
develop-eggs/
|
|
22
|
+
dist/
|
|
23
|
+
downloads/
|
|
24
|
+
eggs/
|
|
25
|
+
.eggs/
|
|
26
|
+
lib/
|
|
27
|
+
lib64/
|
|
28
|
+
parts/
|
|
29
|
+
sdist/
|
|
30
|
+
var/
|
|
31
|
+
wheels/
|
|
32
|
+
*.egg-info/
|
|
33
|
+
.installed.cfg
|
|
34
|
+
*.egg
|
|
35
|
+
MANIFEST
|
|
36
|
+
|
|
37
|
+
# PyInstaller
|
|
38
|
+
# Usually these files are written by a python script from a template
|
|
39
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
40
|
+
*.manifest
|
|
41
|
+
*.spec
|
|
42
|
+
|
|
43
|
+
# Installer logs
|
|
44
|
+
pip-log.txt
|
|
45
|
+
pip-delete-this-directory.txt
|
|
46
|
+
|
|
47
|
+
# Unit test / coverage reports
|
|
48
|
+
htmlcov/
|
|
49
|
+
.tox/
|
|
50
|
+
.coverage
|
|
51
|
+
.coverage.*
|
|
52
|
+
.cache
|
|
53
|
+
nosetests.xml
|
|
54
|
+
coverage.xml
|
|
55
|
+
*.cover
|
|
56
|
+
.hypothesis/
|
|
57
|
+
.pytest_cache/
|
|
58
|
+
|
|
59
|
+
# Sphinx documentation
|
|
60
|
+
docs/_build/
|
|
61
|
+
|
|
62
|
+
# PyBuilder
|
|
63
|
+
target/
|
|
64
|
+
|
|
65
|
+
# Jupyter Notebook
|
|
66
|
+
.ipynb_checkpoints
|
|
67
|
+
|
|
68
|
+
# pyenv
|
|
69
|
+
.python-version
|
|
70
|
+
|
|
71
|
+
# Environments
|
|
72
|
+
.env
|
|
73
|
+
.venv
|
|
74
|
+
env/
|
|
75
|
+
venv/
|
|
76
|
+
ENV/
|
|
77
|
+
env.bak/
|
|
78
|
+
venv.bak/
|
|
79
|
+
|
|
80
|
+
# mkdocs documentation
|
|
81
|
+
/site
|
|
82
|
+
|
|
83
|
+
# mypy
|
|
84
|
+
.mypy_cache/
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# See https://pre-commit.com for more information
|
|
2
|
+
# See https://pre-commit.com/hooks.html for more hooks
|
|
3
|
+
repos:
|
|
4
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
5
|
+
rev: v6.0.0
|
|
6
|
+
hooks:
|
|
7
|
+
- id: check-added-large-files
|
|
8
|
+
args: [--maxkb=515, --enforce-all]
|
|
9
|
+
- id: check-ast
|
|
10
|
+
- id: check-executables-have-shebangs
|
|
11
|
+
- id: check-json
|
|
12
|
+
- id: check-merge-conflict
|
|
13
|
+
- id: check-symlinks
|
|
14
|
+
- id: check-toml
|
|
15
|
+
- id: check-yaml
|
|
16
|
+
- id: debug-statements
|
|
17
|
+
- id: end-of-file-fixer
|
|
18
|
+
- id: end-of-file-fixer
|
|
19
|
+
- id: name-tests-test
|
|
20
|
+
args: [--pytest-test-first]
|
|
21
|
+
- id: trailing-whitespace
|
|
22
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
23
|
+
rev: v0.15.0
|
|
24
|
+
hooks:
|
|
25
|
+
- id: ruff-check
|
|
26
|
+
args: [--fix, --exit-non-zero-on-fix, --show-fixes]
|
|
27
|
+
- id: ruff-format
|
|
28
|
+
- repo: https://github.com/rhysd/actionlint
|
|
29
|
+
rev: v1.7.10
|
|
30
|
+
hooks:
|
|
31
|
+
- id: actionlint
|
|
32
|
+
- repo: https://github.com/google/yamlfmt
|
|
33
|
+
rev: v0.21.0
|
|
34
|
+
hooks:
|
|
35
|
+
- id: yamlfmt
|
|
36
|
+
- repo: https://github.com/numpy/numpydoc
|
|
37
|
+
rev: v1.10.0
|
|
38
|
+
hooks:
|
|
39
|
+
- id: numpydoc-validation
|
|
40
|
+
exclude: (tests/|docs/).*
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use this software, please cite it as below."
|
|
3
|
+
title: "telomore"
|
|
4
|
+
version: 0.4.0
|
|
5
|
+
date-released: 2025-10-14
|
|
6
|
+
authors:
|
|
7
|
+
- family-names: Faurdal
|
|
8
|
+
given-names: David
|
|
9
|
+
affiliation: "Technical University of Denmark"
|
|
10
|
+
- family-names: Booth
|
|
11
|
+
given-names: Thom J.
|
|
12
|
+
affiliation: "Technical University of Denmark"
|
|
13
|
+
- family-names: Weber
|
|
14
|
+
given-names: Tilmann
|
|
15
|
+
affiliation: "Technical University of Denmark"
|
|
16
|
+
- family-names: Sparholt Jørgensen
|
|
17
|
+
given-names: Tue
|
|
18
|
+
affiliation: "Technical University of Denmark"
|
|
19
|
+
repository-code: "https://github.com/dalofa/telomore"
|
|
20
|
+
license: MIT
|
|
21
|
+
abstract: >-
|
|
22
|
+
Members of the Gram-positive Streptomycetaceae family of bacteria have linear chromosomes and carry linear plasmids, which end in telomeres bound by proteins. In a large-scale analysis of 762 linear complete genomes, we discovered that the telomeres were truncated in most assemblies, as they are not captured by Oxford Nanopore sequencing. To address this issue, we present Telomore, a tool to reconstitute this missing telomeric sequence using ONT and Illumina data. In the studied dataset, Telomore increased detection of archetypal telomeres from 0% to 37%, which could be near the occurrence rate in nature. Combining these reconstituted telomeres with previously published telomeres and all complete Streptomycetaceae RefSeq genomes, we created a compendium of more than 2000 telomeres. Similarity-based clustering identified 137 telomere clusters. We find that 78% of Telomore-extended chromosomes encode both telomeres, while this is only the case for 15% of comparable RefSeq chromosomes. Therefore, most assignments of “complete” to Streptomycetaceae are erroneous. Finally, we mined the 762 genomes for known telomeric maintenance proteins and used those to identify a plasmid-specific archetypal telomere and to identify a previously unidentified protein family likely involved with the maintenance of Sg2247-class telomeres. Together, these results highlight a common issue assembling complete linear Streptomycetaceae genomes and provide a programmatic solution and identify a candidate for a new telomeric protein.
|
|
23
|
+
keywords:
|
|
24
|
+
- genomics
|
|
25
|
+
- telomeres
|
|
26
|
+
- bioinformatics
|
|
27
|
+
preferred-citation:
|
|
28
|
+
type: article
|
|
29
|
+
authors:
|
|
30
|
+
- family-names: Faurdal
|
|
31
|
+
given-names: David
|
|
32
|
+
affiliation: "Technical University of Denmark"
|
|
33
|
+
- family-names: Booth
|
|
34
|
+
given-names: Thom J.
|
|
35
|
+
affiliation: "Technical University of Denmark"
|
|
36
|
+
- family-names: Weber
|
|
37
|
+
given-names: Tilmann
|
|
38
|
+
affiliation: "Technical University of Denmark"
|
|
39
|
+
- family-names: Sparholt Jørgensen
|
|
40
|
+
given-names: Tue
|
|
41
|
+
affiliation: "Technical University of Denmark"
|
|
42
|
+
title: "Tying up loose ends: Recovering thousands of missing telomeres from Streptomyces and other Streptomycetaceae genomes."
|
|
43
|
+
year: 2025
|
|
44
|
+
journal: BioRxiv
|
|
45
|
+
url: "https://www.biorxiv.org/content/early/2025/10/14/2025.10.14.682034"
|
|
46
|
+
repository-code: "https://github.com/dalofa/telomore"
|
|
47
|
+
doi: 10.1101/2025.10.14.682034
|
telomore-0.4.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Technical University of Denmark
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
telomore-0.4.1/PKG-INFO
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: telomore
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: Identify and extract telomeric sequences from Oxford Nanopore or Illumina sequencing reads to extend Streptomycetes assemblies.
|
|
5
|
+
Project-URL: documentation, https://github.com/dalofa/telomore
|
|
6
|
+
Project-URL: homepage, https://github.com/dalofa/telomore
|
|
7
|
+
Project-URL: repository, https://github.com/dalofa/telomore
|
|
8
|
+
Author-email: David Faurdal <dalofa@biosustain.dtu.dk>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Requires-Dist: biopython
|
|
16
|
+
Requires-Dist: pysam
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: hatch; extra == 'dev'
|
|
19
|
+
Requires-Dist: isort; extra == 'dev'
|
|
20
|
+
Requires-Dist: numpydoc-validation; extra == 'dev'
|
|
21
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
22
|
+
Requires-Dist: pydocstyle; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# TELOMORE
|
|
27
|
+
|
|
28
|
+
Telomore is a tool for identifying and extracting telomeric sequences from
|
|
29
|
+
**Oxford Nanopore** or **Illumina** sequencing reads of *Streptomycetes spp.*
|
|
30
|
+
that have been excluded from a *de novo* assembly. It processes sequencing data
|
|
31
|
+
to extend assemblies, generate quality control (QC) maps, and produce finalized
|
|
32
|
+
assemblies with the telomere/recessed bases included.
|
|
33
|
+
|
|
34
|
+
## Before running Telomore
|
|
35
|
+
|
|
36
|
+
Telomore does not identify linear contigs but rather relies on the user to provide
|
|
37
|
+
that information in the header of the fasta-reference file.
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
telomore --mode <mode> --reference <reference.fasta> [options]
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Required Arguments
|
|
46
|
+
|
|
47
|
+
- `--mode` Specify the sequencing platform. Options: nanopore or illumina.
|
|
48
|
+
- `--reference` Path to the reference genome file in FASTA format.
|
|
49
|
+
|
|
50
|
+
Nanopore-Specific Arguments
|
|
51
|
+
|
|
52
|
+
- `--single` Path to a single gzipped FASTQ file containing Nanopore reads.
|
|
53
|
+
|
|
54
|
+
Illumina-Specific Arguments
|
|
55
|
+
|
|
56
|
+
- `--read1` Path to gzipped FASTQ file for Illumina read 1.
|
|
57
|
+
- `--read2` Path to gzipped FASTQ file for Illumina read 2.
|
|
58
|
+
|
|
59
|
+
Optional Arguments
|
|
60
|
+
|
|
61
|
+
- `--coverage_threshold` Set the threshold for coverage to stop trimming during
|
|
62
|
+
consensus trimming (Default is coverage=5 for ONT reads and coverage=1 for
|
|
63
|
+
Illumina reads).
|
|
64
|
+
- `--quality_threshold` Set the Q-score required to count a read position in the
|
|
65
|
+
coverage calculation during consensus trimming (Default is Q-score=10 for ONT
|
|
66
|
+
reads and Q-score=30 for Illumina reads).
|
|
67
|
+
- `--threads` Number of threads to use (default: 1).
|
|
68
|
+
- `--keep` Retain intermediate files (default: False).
|
|
69
|
+
- `--quiet` Suppress console logging.
|
|
70
|
+
|
|
71
|
+
## Process overview
|
|
72
|
+
|
|
73
|
+
The process is as follows:
|
|
74
|
+
|
|
75
|
+
1. **Map Reads:**
|
|
76
|
+
Reads are mapped against all contigs in a reference using either minimap2 or
|
|
77
|
+
Bowtie2.
|
|
78
|
+
2. **Extract Extending Reads**
|
|
79
|
+
Extending reads that are mapped to the ends of linear contigs are extracted.
|
|
80
|
+
3. **Build Consensus**
|
|
81
|
+
The terminal extending reads from each end are used to construct a consensus
|
|
82
|
+
using either lamassemble or mafft + EMBOSS cons
|
|
83
|
+
4. **Align and Attach consensus**
|
|
84
|
+
The consensus for each end is aligned to the reference and used to extend it.
|
|
85
|
+
5. **Trim Extended Replicon**
|
|
86
|
+
In a final step, all terminally mapped reads are mapped to the new extended
|
|
87
|
+
reference and used to trim away spurious sequence, based on read-support.
|
|
88
|
+
|
|
89
|
+
## Outputs
|
|
90
|
+
|
|
91
|
+
At the end of a run Telomore produces the following outputs:
|
|
92
|
+
|
|
93
|
+
```Output
|
|
94
|
+
├── {fasta_basename}_{seqtype}_telomore
|
|
95
|
+
│ ├── {contig_name}_telomore_extended.fasta
|
|
96
|
+
│ ├── {contig_name}_telomore_ext_{seqtype}.log
|
|
97
|
+
│ ├── {contig_name}_telomore_QC.bam
|
|
98
|
+
│ ├── {contig_name}_telomore_QC.bam.bai
|
|
99
|
+
│ ├── {contig_name}_telomore_untrimmed.fasta
|
|
100
|
+
│ └── {fasta_basename}_telomore.fasta
|
|
101
|
+
└── telomore.log # log containing run information.
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
In the folder there are a number of files generated for each contig considered:
|
|
105
|
+
|
|
106
|
+
| File Name | Description |
|
|
107
|
+
|-----------|-------------|
|
|
108
|
+
| `{contig_name}_telomore_extended.fasta` | Original contig sequence + added terminal bases - trimmed bases |
|
|
109
|
+
| `{contig_name}_telomore_ext_{seqtype}.log` | Log containing information about bases added, trimmed off and final result. |
|
|
110
|
+
| `{contig_name}_telomore_QC.bam` | BAM file containing terminal reads mapped to `{contig_name}_telomore_extended.fasta`. Useful for manual inspection of the extension. |
|
|
111
|
+
| `{contig_name}_telomore_QC.bam.bai` | Index file for the corresponding BAM file. |
|
|
112
|
+
| `{contig_name}_telomore_untrimmed.fasta` | Original contig sequence + added terminal bases |
|
|
113
|
+
|
|
114
|
+
Additionally, there is a fasta-file collecting all tagged linear contigs as they
|
|
115
|
+
appear in `{contig_name}_telomore_extended.fasta` together with all non-linear
|
|
116
|
+
contigs in the order they appear in the original file.
|
|
117
|
+
|
|
118
|
+
Inspecting the `{contig_name}_telomore_QC.bam` file in IGV (Integrative Genomics Viewer)
|
|
119
|
+
can be informative in evaluating the extended contig.
|
|
120
|
+
|
|
121
|
+
## Dependencies (CLI-tools)
|
|
122
|
+
|
|
123
|
+
- Bowtie2
|
|
124
|
+
- Emboss tools (cons specifically)
|
|
125
|
+
- Lamassemble
|
|
126
|
+
- LAST-DB
|
|
127
|
+
- Mafft
|
|
128
|
+
- Minimap2, version 2.25 or higher
|
|
129
|
+
- Samtools
|
|
130
|
+
|
|
131
|
+
These can be installed using the conda recipe in this repo:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
conda env create -f environment.yml -y
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
This repo can then be downloaded using git clone, the conda environment activated
|
|
138
|
+
and the tool installed
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Activate telomore conda env
|
|
142
|
+
conda activate telomore
|
|
143
|
+
|
|
144
|
+
# Clone telomore repo
|
|
145
|
+
git clone https://github.com/dalofa/telomore && cd telomore
|
|
146
|
+
|
|
147
|
+
# Install package
|
|
148
|
+
pip install -e '.[dev]'
|
|
149
|
+
```
|
telomore-0.4.1/README.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# TELOMORE
|
|
2
|
+
|
|
3
|
+
Telomore is a tool for identifying and extracting telomeric sequences from
|
|
4
|
+
**Oxford Nanopore** or **Illumina** sequencing reads of *Streptomycetes spp.*
|
|
5
|
+
that have been excluded from a *de novo* assembly. It processes sequencing data
|
|
6
|
+
to extend assemblies, generate quality control (QC) maps, and produce finalized
|
|
7
|
+
assemblies with the telomere/recessed bases included.
|
|
8
|
+
|
|
9
|
+
## Before running Telomore
|
|
10
|
+
|
|
11
|
+
Telomore does not identify linear contigs but rather relies on the user to provide
|
|
12
|
+
that information in the header of the fasta-reference file.
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
telomore --mode <mode> --reference <reference.fasta> [options]
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Required Arguments
|
|
21
|
+
|
|
22
|
+
- `--mode` Specify the sequencing platform. Options: nanopore or illumina.
|
|
23
|
+
- `--reference` Path to the reference genome file in FASTA format.
|
|
24
|
+
|
|
25
|
+
Nanopore-Specific Arguments
|
|
26
|
+
|
|
27
|
+
- `--single` Path to a single gzipped FASTQ file containing Nanopore reads.
|
|
28
|
+
|
|
29
|
+
Illumina-Specific Arguments
|
|
30
|
+
|
|
31
|
+
- `--read1` Path to gzipped FASTQ file for Illumina read 1.
|
|
32
|
+
- `--read2` Path to gzipped FASTQ file for Illumina read 2.
|
|
33
|
+
|
|
34
|
+
Optional Arguments
|
|
35
|
+
|
|
36
|
+
- `--coverage_threshold` Set the threshold for coverage to stop trimming during
|
|
37
|
+
consensus trimming (Default is coverage=5 for ONT reads and coverage=1 for
|
|
38
|
+
Illumina reads).
|
|
39
|
+
- `--quality_threshold` Set the Q-score required to count a read position in the
|
|
40
|
+
coverage calculation during consensus trimming (Default is Q-score=10 for ONT
|
|
41
|
+
reads and Q-score=30 for Illumina reads).
|
|
42
|
+
- `--threads` Number of threads to use (default: 1).
|
|
43
|
+
- `--keep` Retain intermediate files (default: False).
|
|
44
|
+
- `--quiet` Suppress console logging.
|
|
45
|
+
|
|
46
|
+
## Process overview
|
|
47
|
+
|
|
48
|
+
The process is as follows:
|
|
49
|
+
|
|
50
|
+
1. **Map Reads:**
|
|
51
|
+
Reads are mapped against all contigs in a reference using either minimap2 or
|
|
52
|
+
Bowtie2.
|
|
53
|
+
2. **Extract Extending Reads**
|
|
54
|
+
Extending reads that are mapped to the ends of linear contigs are extracted.
|
|
55
|
+
3. **Build Consensus**
|
|
56
|
+
The terminal extending reads from each end are used to construct a consensus
|
|
57
|
+
using either lamassemble or mafft + EMBOSS cons
|
|
58
|
+
4. **Align and Attach consensus**
|
|
59
|
+
The consensus for each end is aligned to the reference and used to extend it.
|
|
60
|
+
5. **Trim Extended Replicon**
|
|
61
|
+
In a final step, all terminally mapped reads are mapped to the new extended
|
|
62
|
+
reference and used to trim away spurious sequence, based on read-support.
|
|
63
|
+
|
|
64
|
+
## Outputs
|
|
65
|
+
|
|
66
|
+
At the end of a run Telomore produces the following outputs:
|
|
67
|
+
|
|
68
|
+
```Output
|
|
69
|
+
├── {fasta_basename}_{seqtype}_telomore
|
|
70
|
+
│ ├── {contig_name}_telomore_extended.fasta
|
|
71
|
+
│ ├── {contig_name}_telomore_ext_{seqtype}.log
|
|
72
|
+
│ ├── {contig_name}_telomore_QC.bam
|
|
73
|
+
│ ├── {contig_name}_telomore_QC.bam.bai
|
|
74
|
+
│ ├── {contig_name}_telomore_untrimmed.fasta
|
|
75
|
+
│ └── {fasta_basename}_telomore.fasta
|
|
76
|
+
└── telomore.log # log containing run information.
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
In the folder there are a number of files generated for each contig considered:
|
|
80
|
+
|
|
81
|
+
| File Name | Description |
|
|
82
|
+
|-----------|-------------|
|
|
83
|
+
| `{contig_name}_telomore_extended.fasta` | Original contig sequence + added terminal bases - trimmed bases |
|
|
84
|
+
| `{contig_name}_telomore_ext_{seqtype}.log` | Log containing information about bases added, trimmed off and final result. |
|
|
85
|
+
| `{contig_name}_telomore_QC.bam` | BAM file containing terminal reads mapped to `{contig_name}_telomore_extended.fasta`. Useful for manual inspection of the extension. |
|
|
86
|
+
| `{contig_name}_telomore_QC.bam.bai` | Index file for the corresponding BAM file. |
|
|
87
|
+
| `{contig_name}_telomore_untrimmed.fasta` | Original contig sequence + added terminal bases |
|
|
88
|
+
|
|
89
|
+
Additionally, there is a fasta-file collecting all tagged linear contigs as they
|
|
90
|
+
appear in `{contig_name}_telomore_extended.fasta` together with all non-linear
|
|
91
|
+
contigs in the order they appear in the original file.
|
|
92
|
+
|
|
93
|
+
Inspecting the `{contig_name}_telomore_QC.bam` file in IGV (Integrative Genomics Viewer)
|
|
94
|
+
can be informative in evaluating the extended contig.
|
|
95
|
+
|
|
96
|
+
## Dependencies (CLI-tools)
|
|
97
|
+
|
|
98
|
+
- Bowtie2
|
|
99
|
+
- Emboss tools (cons specifically)
|
|
100
|
+
- Lamassemble
|
|
101
|
+
- LAST-DB
|
|
102
|
+
- Mafft
|
|
103
|
+
- Minimap2, version 2.25 or higher
|
|
104
|
+
- Samtools
|
|
105
|
+
|
|
106
|
+
These can be installed using the conda recipe in this repo:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
conda env create -f environment.yml -y
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
This repo can then be downloaded using git clone, the conda environment activated
|
|
113
|
+
and the tool installed
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Activate telomore conda env
|
|
117
|
+
conda activate telomore
|
|
118
|
+
|
|
119
|
+
# Clone telomore repo
|
|
120
|
+
git clone https://github.com/dalofa/telomore && cd telomore
|
|
121
|
+
|
|
122
|
+
# Install package
|
|
123
|
+
pip install -e '.[dev]'
|
|
124
|
+
```
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "telomore"
|
|
7
|
+
description = "Identify and extract telomeric sequences from Oxford Nanopore or Illumina sequencing reads to extend Streptomycetes assemblies."
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "David Faurdal", email = "dalofa@biosustain.dtu.dk" }]
|
|
12
|
+
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
dependencies = ["biopython", "pysam"]
|
|
20
|
+
|
|
21
|
+
dynamic = ["version"]
|
|
22
|
+
|
|
23
|
+
# Optional dependencies for development (build, linting, docstring checks, pre-commit)
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = ["hatch", "isort", "numpydoc-validation", "pre-commit", "pydocstyle", "ruff"]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
documentation = "https://github.com/dalofa/telomore"
|
|
29
|
+
homepage = "https://github.com/dalofa/telomore"
|
|
30
|
+
repository = "https://github.com/dalofa/telomore"
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
telomore = "telomore.app:entrypoint"
|
|
34
|
+
|
|
35
|
+
[tool.hatch.build]
|
|
36
|
+
source = "src"
|
|
37
|
+
exclude = ["environment.yml", ".github", ".vscode"]
|
|
38
|
+
|
|
39
|
+
[tool.hatch.build.targets.wheel]
|
|
40
|
+
packages = ["src/telomore"]
|
|
41
|
+
|
|
42
|
+
[tool.hatch.version]
|
|
43
|
+
source = "vcs"
|
|
44
|
+
|
|
45
|
+
[tool.hatch.version.vcs]
|
|
46
|
+
tag-pattern = "v*" # Git tags starting with 'v' will be used for versioning
|
|
47
|
+
fallback-version = "0.0.0"
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.hooks.vcs]
|
|
50
|
+
version-file = "src/telomore/_version.py"
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
addopts = "-v --cov=telomore --cov-branch --cov-report=xml --cov-report=term"
|
|
54
|
+
testpaths = ["tests"]
|
|
55
|
+
python_files = ["test_*.py"]
|
|
56
|
+
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
target-version = "py310"
|
|
59
|
+
line-length = 88
|
|
60
|
+
fix = true
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint]
|
|
63
|
+
select = [
|
|
64
|
+
"C", # mccabe rules
|
|
65
|
+
"F", # pyflakes rules
|
|
66
|
+
"E", # pycodestyle error rules
|
|
67
|
+
"W", # pycodestyle warning rules
|
|
68
|
+
"B", # flake8-bugbear rules
|
|
69
|
+
"I", # isort rules
|
|
70
|
+
"D", # pydocstyle rules
|
|
71
|
+
]
|
|
72
|
+
ignore = [
|
|
73
|
+
"C901", # max-complexity-10
|
|
74
|
+
"E501", # line-too-long
|
|
75
|
+
"I001", # isort-imports
|
|
76
|
+
"B905", # `zip()` without an explicit `strict=` parameter
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
# Don't auto-fix docstring issues - they're too fragile
|
|
80
|
+
unfixable = ["D"]
|
|
81
|
+
|
|
82
|
+
[tool.ruff.lint.per-file-ignores]
|
|
83
|
+
"tests/*" = ["D"] # Ignore all pydocstyle rules in tests
|
|
84
|
+
|
|
85
|
+
[tool.ruff.lint.pydocstyle]
|
|
86
|
+
convention = "numpy"
|
|
87
|
+
|
|
88
|
+
[tool.ruff.format]
|
|
89
|
+
indent-style = "space"
|
|
90
|
+
quote-style = "single"
|
|
91
|
+
|
|
92
|
+
[tool.ruff.lint.isort]
|
|
93
|
+
known-third-party = ["biopython", "pysam", "Bio"]
|
|
94
|
+
known-first-party = ["telomore"]
|
|
95
|
+
force-sort-within-sections = true
|
|
96
|
+
|
|
97
|
+
[tool.isort]
|
|
98
|
+
profile = "black"
|
|
99
|
+
known_third_party = ["biopython", "pysam", "Bio"]
|
|
100
|
+
known_first_party = ["telomore"]
|
|
101
|
+
default_section = "THIRDPARTY"
|
|
102
|
+
force_sort_within_sections = true
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
[tool.numpydoc_validation]
|
|
106
|
+
checks = [
|
|
107
|
+
"all", # report on all checks
|
|
108
|
+
"ES01", # but don't require an extended summary
|
|
109
|
+
"EX01", # or examples
|
|
110
|
+
"SA01", # or a see also section
|
|
111
|
+
"SS06", # and don't require the summary to fit on one line
|
|
112
|
+
]
|
|
113
|
+
exclude = [ # don't report on checks for these
|
|
114
|
+
'\.__init__$',
|
|
115
|
+
'\.__repr__$',
|
|
116
|
+
'\.__str__$',
|
|
117
|
+
]
|
|
118
|
+
override_SS05 = [ # allow docstrings to start with these words
|
|
119
|
+
'^Process ',
|
|
120
|
+
'^Assess ',
|
|
121
|
+
'^Access ',
|
|
122
|
+
]
|
|
123
|
+
|
|
124
|
+
# Don't process filepaths that match these regex patterns
|
|
125
|
+
exclude_files = [
|
|
126
|
+
'^_version\\.py$',
|
|
127
|
+
]
|
|
128
|
+
|
|
129
|
+
[tool.mypy]
|
|
130
|
+
# Type hint strictness settings
|
|
131
|
+
python_version = "3.10"
|
|
132
|
+
warn_return_any = true
|
|
133
|
+
warn_unused_configs = true
|
|
134
|
+
disallow_untyped_defs = true
|
|
135
|
+
|
|
136
|
+
[tool.pydocstyle]
|
|
137
|
+
convention = "numpy"
|
|
138
|
+
match-dir = "[^\\.].*" # matches all dirs that don't start with a dot
|
|
139
|
+
match = "(?!test_).*\\.py" # matches files that don't start with 'test_' but end with '.py'
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '0.4.1'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 4, 1)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = None
|