RNApolis 0.2.1__tar.gz → 0.3.1__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {RNApolis-0.2.1 → RNApolis-0.3.1}/PKG-INFO +39 -1
- RNApolis-0.2.1/src/RNApolis.egg-info/PKG-INFO → RNApolis-0.3.1/README.md +35 -25
- {RNApolis-0.2.1 → RNApolis-0.3.1}/setup.py +5 -1
- RNApolis-0.2.1/README.md → RNApolis-0.3.1/src/RNApolis.egg-info/PKG-INFO +63 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/RNApolis.egg-info/SOURCES.txt +1 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/RNApolis.egg-info/entry_points.txt +1 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/RNApolis.egg-info/requires.txt +3 -0
- RNApolis-0.3.1/src/rnapolis/rfam_folder.py +294 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/LICENSE +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/pyproject.toml +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/setup.cfg +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/RNApolis.egg-info/dependency_links.txt +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/RNApolis.egg-info/top_level.txt +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/annotator.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/clashfinder.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/common.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/metareader.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/molecule_filter.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/motif_extractor.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/parser.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/tertiary.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/transformer.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/src/rnapolis/util.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_annotator.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_bugfixes.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_common.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_metareader.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_parser.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_quadruplexes.py +0 -0
- {RNApolis-0.2.1 → RNApolis-0.3.1}/tests/test_tertiary.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: RNApolis
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: A Python library containing RNA-related bioinformatics functions and classes
|
5
5
|
Home-page: https://github.com/tzok/rnapolis-py
|
6
6
|
Author: Tomasz Zok
|
@@ -15,13 +15,16 @@ Classifier: Programming Language :: Python :: 3
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
16
16
|
Description-Content-Type: text/markdown
|
17
17
|
License-File: LICENSE
|
18
|
+
Requires-Dist: appdirs
|
18
19
|
Requires-Dist: graphviz
|
19
20
|
Requires-Dist: mmcif
|
20
21
|
Requires-Dist: numpy
|
21
22
|
Requires-Dist: ordered-set
|
22
23
|
Requires-Dist: orjson
|
23
24
|
Requires-Dist: pulp
|
25
|
+
Requires-Dist: requests
|
24
26
|
Requires-Dist: scipy
|
27
|
+
Requires-Dist: viennarna
|
25
28
|
|
26
29
|
# RNApolis
|
27
30
|
|
@@ -2133,3 +2136,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
|
|
2133
2136
|
- `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
|
2134
2137
|
|
2135
2138
|
For additional guidance, use `-h` or `--help`.
|
2139
|
+
|
2140
|
+
### `rfam-folder`
|
2141
|
+
|
2142
|
+
`rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
|
2143
|
+
|
2144
|
+
**Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
|
2145
|
+
|
2146
|
+
#### Usage:
|
2147
|
+
|
2148
|
+
The general usage pattern for rfam_folder.py is as follows:
|
2149
|
+
|
2150
|
+
```bash
|
2151
|
+
usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
|
2152
|
+
```
|
2153
|
+
|
2154
|
+
Positional Arguments:
|
2155
|
+
|
2156
|
+
- sequence: An RNA sequence directly or a path to a FASTA file containing one or more sequences.
|
2157
|
+
|
2158
|
+
Options:
|
2159
|
+
|
2160
|
+
- `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
|
2161
|
+
- `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
|
2162
|
+
- `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
|
2163
|
+
|
2164
|
+
#### Examples
|
2165
|
+
|
2166
|
+
Generate a consensus structure for a single RNA sequence given specific Rfam family:
|
2167
|
+
|
2168
|
+
```
|
2169
|
+
$ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
|
2170
|
+
>header
|
2171
|
+
AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
|
2172
|
+
..(((((..(((((.......)))))((....)).(((....)))....)))))....
|
2173
|
+
```
|
@@ -1,28 +1,3 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: RNApolis
|
3
|
-
Version: 0.2.1
|
4
|
-
Summary: A Python library containing RNA-related bioinformatics functions and classes
|
5
|
-
Home-page: https://github.com/tzok/rnapolis-py
|
6
|
-
Author: Tomasz Zok
|
7
|
-
Author-email: tomasz.zok@cs.put.poznan.pl
|
8
|
-
Project-URL: Bug Tracker, https://github.com/tzok/rnapolis-py/issues
|
9
|
-
Classifier: Development Status :: 5 - Production/Stable
|
10
|
-
Classifier: Environment :: Console
|
11
|
-
Classifier: Intended Audience :: Science/Research
|
12
|
-
Classifier: License :: OSI Approved :: MIT License
|
13
|
-
Classifier: Operating System :: OS Independent
|
14
|
-
Classifier: Programming Language :: Python :: 3
|
15
|
-
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
16
|
-
Description-Content-Type: text/markdown
|
17
|
-
License-File: LICENSE
|
18
|
-
Requires-Dist: graphviz
|
19
|
-
Requires-Dist: mmcif
|
20
|
-
Requires-Dist: numpy
|
21
|
-
Requires-Dist: ordered-set
|
22
|
-
Requires-Dist: orjson
|
23
|
-
Requires-Dist: pulp
|
24
|
-
Requires-Dist: scipy
|
25
|
-
|
26
1
|
# RNApolis
|
27
2
|
|
28
3
|
A Python library and utilities containing RNA-related bioinformatics functions and classes.
|
@@ -2133,3 +2108,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
|
|
2133
2108
|
- `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
|
2134
2109
|
|
2135
2110
|
For additional guidance, use `-h` or `--help`.
|
2111
|
+
|
2112
|
+
### `rfam-folder`
|
2113
|
+
|
2114
|
+
`rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
|
2115
|
+
|
2116
|
+
**Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
|
2117
|
+
|
2118
|
+
#### Usage:
|
2119
|
+
|
2120
|
+
The general usage pattern for rfam_folder.py is as follows:
|
2121
|
+
|
2122
|
+
```bash
|
2123
|
+
usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
|
2124
|
+
```
|
2125
|
+
|
2126
|
+
Positional Arguments:
|
2127
|
+
|
2128
|
+
- sequence: An RNA sequence directly or a path to a FASTA file containing one or more sequences.
|
2129
|
+
|
2130
|
+
Options:
|
2131
|
+
|
2132
|
+
- `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
|
2133
|
+
- `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
|
2134
|
+
- `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
|
2135
|
+
|
2136
|
+
#### Examples
|
2137
|
+
|
2138
|
+
Generate a consensus structure for a single RNA sequence given specific Rfam family:
|
2139
|
+
|
2140
|
+
```
|
2141
|
+
$ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
|
2142
|
+
>header
|
2143
|
+
AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
|
2144
|
+
..(((((..(((((.......)))))((....)).(((....)))....)))))....
|
2145
|
+
```
|
@@ -5,7 +5,7 @@ with open("README.md") as f:
|
|
5
5
|
|
6
6
|
setup(
|
7
7
|
name="RNApolis",
|
8
|
-
version="0.2.1",
|
8
|
+
version="0.3.1",
|
9
9
|
packages=["rnapolis"],
|
10
10
|
package_dir={"": "src"},
|
11
11
|
author="Tomasz Zok",
|
@@ -32,15 +32,19 @@ setup(
|
|
32
32
|
"molecule-filter=rnapolis.molecule_filter:main",
|
33
33
|
"motif-extractor=rnapolis.motif_extractor:main",
|
34
34
|
"transformer=rnapolis.transformer:main",
|
35
|
+
"rfam-folder=rnapolis.rfam_folder:main",
|
35
36
|
]
|
36
37
|
},
|
37
38
|
install_requires=[
|
39
|
+
"appdirs",
|
38
40
|
"graphviz",
|
39
41
|
"mmcif",
|
40
42
|
"numpy",
|
41
43
|
"ordered-set",
|
42
44
|
"orjson",
|
43
45
|
"pulp",
|
46
|
+
"requests",
|
44
47
|
"scipy",
|
48
|
+
"viennarna",
|
45
49
|
],
|
46
50
|
)
|
@@ -1,3 +1,31 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: RNApolis
|
3
|
+
Version: 0.3.1
|
4
|
+
Summary: A Python library containing RNA-related bioinformatics functions and classes
|
5
|
+
Home-page: https://github.com/tzok/rnapolis-py
|
6
|
+
Author: Tomasz Zok
|
7
|
+
Author-email: tomasz.zok@cs.put.poznan.pl
|
8
|
+
Project-URL: Bug Tracker, https://github.com/tzok/rnapolis-py/issues
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
10
|
+
Classifier: Environment :: Console
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Operating System :: OS Independent
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
16
|
+
Description-Content-Type: text/markdown
|
17
|
+
License-File: LICENSE
|
18
|
+
Requires-Dist: appdirs
|
19
|
+
Requires-Dist: graphviz
|
20
|
+
Requires-Dist: mmcif
|
21
|
+
Requires-Dist: numpy
|
22
|
+
Requires-Dist: ordered-set
|
23
|
+
Requires-Dist: orjson
|
24
|
+
Requires-Dist: pulp
|
25
|
+
Requires-Dist: requests
|
26
|
+
Requires-Dist: scipy
|
27
|
+
Requires-Dist: viennarna
|
28
|
+
|
1
29
|
# RNApolis
|
2
30
|
|
3
31
|
A Python library and utilities containing RNA-related bioinformatics functions and classes.
|
@@ -2108,3 +2136,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
|
|
2108
2136
|
- `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
|
2109
2137
|
|
2110
2138
|
For additional guidance, use `-h` or `--help`.
|
2139
|
+
|
2140
|
+
### `rfam-folder`
|
2141
|
+
|
2142
|
+
`rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
|
2143
|
+
|
2144
|
+
**Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
|
2145
|
+
|
2146
|
+
#### Usage:
|
2147
|
+
|
2148
|
+
The general usage pattern for rfam_folder.py is as follows:
|
2149
|
+
|
2150
|
+
```bash
|
2151
|
+
usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
|
2152
|
+
```
|
2153
|
+
|
2154
|
+
Positional Arguments:
|
2155
|
+
|
2156
|
+
- sequence: An RNA sequence directly or a path to a FASTA file containing one or more sequences.
|
2157
|
+
|
2158
|
+
Options:
|
2159
|
+
|
2160
|
+
- `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
|
2161
|
+
- `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
|
2162
|
+
- `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
|
2163
|
+
|
2164
|
+
#### Examples
|
2165
|
+
|
2166
|
+
Generate a consensus structure for a single RNA sequence given specific Rfam family:
|
2167
|
+
|
2168
|
+
```
|
2169
|
+
$ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
|
2170
|
+
>header
|
2171
|
+
AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
|
2172
|
+
..(((((..(((((.......)))))((....)).(((....)))....)))))....
|
2173
|
+
```
|
@@ -0,0 +1,294 @@
|
|
1
|
+
#! /usr/bin/env python
|
2
|
+
import argparse
|
3
|
+
import gzip
|
4
|
+
import os
|
5
|
+
import re
|
6
|
+
import shutil
|
7
|
+
import subprocess
|
8
|
+
import tempfile
|
9
|
+
from typing import List
|
10
|
+
|
11
|
+
import appdirs
|
12
|
+
import requests
|
13
|
+
import RNA
|
14
|
+
|
15
|
+
from rnapolis.common import BpSeq, DotBracket
|
16
|
+
|
17
|
+
COMBINED_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz"
|
18
|
+
SEPARATE_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.tar.gz"
|
19
|
+
|
20
|
+
|
21
|
+
class FASTA:
|
22
|
+
header: str
|
23
|
+
sequence: str
|
24
|
+
|
25
|
+
def __init__(self, header: str, sequence: str):
|
26
|
+
self.header = header
|
27
|
+
self.sequence = sequence
|
28
|
+
|
29
|
+
def __str__(self):
|
30
|
+
return f">{self.header}\n{self.sequence}"
|
31
|
+
|
32
|
+
|
33
|
+
def parse_fasta(fasta_path: str) -> List[FASTA]:
|
34
|
+
"""
|
35
|
+
Read FASTA entries from a file.
|
36
|
+
|
37
|
+
Args:
|
38
|
+
fasta_path (str): The path to the FASTA file.
|
39
|
+
|
40
|
+
Returns:
|
41
|
+
List[Fasta]: A list of FASTA objects representing the entries in the file.
|
42
|
+
"""
|
43
|
+
with open(fasta_path) as f:
|
44
|
+
content = f.read()
|
45
|
+
|
46
|
+
entries = content.split(">")[1:]
|
47
|
+
fastas = []
|
48
|
+
|
49
|
+
for entry in entries:
|
50
|
+
lines = entry.splitlines()
|
51
|
+
header = lines[0]
|
52
|
+
sequence = "".join(lines[1:])
|
53
|
+
fastas.append(FASTA(header, sequence))
|
54
|
+
|
55
|
+
return fastas
|
56
|
+
|
57
|
+
|
58
|
+
def ensure_cm(family: str = None):
|
59
|
+
if not os.path.exists(appdirs.user_data_dir("rnapolis")):
|
60
|
+
os.makedirs(appdirs.user_data_dir("rnapolis"))
|
61
|
+
|
62
|
+
if family is None:
|
63
|
+
cm_gz_path = appdirs.user_data_dir("rnapolis") + "/Rfam.cm.gz"
|
64
|
+
cm_path = appdirs.user_data_dir("rnapolis") + "/Rfam.cm"
|
65
|
+
|
66
|
+
if not os.path.exists(cm_gz_path):
|
67
|
+
response = requests.get(COMBINED_CM)
|
68
|
+
|
69
|
+
with open(cm_gz_path, "wb") as f:
|
70
|
+
f.write(response.content)
|
71
|
+
|
72
|
+
if not os.path.exists(cm_path):
|
73
|
+
with gzip.open(cm_gz_path, "rb") as f_in, open(cm_path, "wb") as f_out:
|
74
|
+
f_out.write(f_in.read())
|
75
|
+
else:
|
76
|
+
cm_gz_path = appdirs.user_data_dir("rnapolis") + "/Rfam.tar.gz"
|
77
|
+
cm_path = appdirs.user_data_dir("rnapolis") + f"/{family}.cm"
|
78
|
+
|
79
|
+
if not os.path.exists(cm_gz_path):
|
80
|
+
response = requests.get(SEPARATE_CM)
|
81
|
+
|
82
|
+
with open(cm_gz_path, "wb") as f:
|
83
|
+
f.write(response.content)
|
84
|
+
|
85
|
+
if not os.path.exists(cm_path):
|
86
|
+
shutil.unpack_archive(cm_gz_path, appdirs.user_data_dir("rnapolis"))
|
87
|
+
|
88
|
+
if not os.path.exists(cm_path):
|
89
|
+
raise RuntimeError(
|
90
|
+
f"Failed to find covariance model for {family} from Rfam."
|
91
|
+
)
|
92
|
+
|
93
|
+
if not os.path.exists(cm_path + ".i1m"):
|
94
|
+
subprocess.run(["cmpress", cm_path], check=True, capture_output=True)
|
95
|
+
|
96
|
+
return cm_path
|
97
|
+
|
98
|
+
|
99
|
+
def analyze_cmsearch(cmsearch: str, fasta: FASTA, count: int = 1):
|
100
|
+
result = []
|
101
|
+
lines = cmsearch.splitlines()
|
102
|
+
begins = [i for i, line in enumerate(lines) if line.startswith(">>")]
|
103
|
+
|
104
|
+
for i, begin in enumerate(begins):
|
105
|
+
nc_index, cs_index = None, None
|
106
|
+
|
107
|
+
for j in range(begin, begins[i + 1] if i + 1 < len(begins) else len(lines)):
|
108
|
+
if lines[j].endswith(" NC"):
|
109
|
+
nc_index = j
|
110
|
+
if lines[j].endswith(" CS"):
|
111
|
+
cs_index = j
|
112
|
+
|
113
|
+
assert len(lines[cs_index].split()) == 2
|
114
|
+
|
115
|
+
structure = lines[cs_index]
|
116
|
+
sequence = lines[cs_index + 3]
|
117
|
+
|
118
|
+
match = re.match(r"\s*.+?\s+(\d+)\s+.+\s+(\d+)", sequence)
|
119
|
+
assert match is not None, sequence
|
120
|
+
first, last = int(match.group(1)), int(match.group(2))
|
121
|
+
|
122
|
+
for i in range(len(structure)):
|
123
|
+
if structure[i] != " ":
|
124
|
+
break
|
125
|
+
j = structure.find(" CS")
|
126
|
+
|
127
|
+
structure = structure[i:j]
|
128
|
+
sequence = sequence[i:j].upper()
|
129
|
+
|
130
|
+
# remove pairs which did not match to consensus
|
131
|
+
if nc_index is not None:
|
132
|
+
non_canonical = lines[nc_index][i:j]
|
133
|
+
for match in re.finditer(r"[v?]", non_canonical):
|
134
|
+
i = match.start()
|
135
|
+
structure = structure[:i] + "." + structure[i + 1 :]
|
136
|
+
|
137
|
+
# replace *[n]* placeholders
|
138
|
+
while True:
|
139
|
+
match = re.search(r"[<*]\[ *(\d+)\][*>]", sequence)
|
140
|
+
|
141
|
+
if match is None:
|
142
|
+
break
|
143
|
+
|
144
|
+
i, j = match.start(), match.end()
|
145
|
+
n = int(match.group(1))
|
146
|
+
sequence = sequence[:i] + "." * n + sequence[j:]
|
147
|
+
structure = structure[:i] + "." * n + structure[j:]
|
148
|
+
|
149
|
+
# replace gaps
|
150
|
+
while True:
|
151
|
+
match = re.search(r"-+", sequence)
|
152
|
+
|
153
|
+
if match is None:
|
154
|
+
break
|
155
|
+
|
156
|
+
i, j = match.start(), match.end()
|
157
|
+
sequence = sequence[:i] + sequence[j:]
|
158
|
+
structure = structure[:i] + structure[j:]
|
159
|
+
|
160
|
+
assert len(sequence) == len(structure)
|
161
|
+
|
162
|
+
if first > last:
|
163
|
+
# https://en.wikipedia.org/wiki/Nucleic_acid_notation
|
164
|
+
complementary = {
|
165
|
+
"A": "U",
|
166
|
+
"C": "G",
|
167
|
+
"G": "C",
|
168
|
+
"U": "A",
|
169
|
+
"W": "W",
|
170
|
+
"S": "S",
|
171
|
+
"M": "K",
|
172
|
+
"K": "M",
|
173
|
+
"R": "Y",
|
174
|
+
"Y": "R",
|
175
|
+
"B": "V",
|
176
|
+
"D": "H",
|
177
|
+
"H": "D",
|
178
|
+
"V": "B",
|
179
|
+
"N": "N",
|
180
|
+
".": ".",
|
181
|
+
}
|
182
|
+
assert set(sequence) <= set(complementary.keys()), (
|
183
|
+
set(sequence) - set(complementary.keys()),
|
184
|
+
sequence,
|
185
|
+
)
|
186
|
+
sequence_comp = "".join([complementary[c] for c in sequence[::-1]])
|
187
|
+
match = re.search(sequence_comp, fasta.sequence)
|
188
|
+
assert match is not None, (sequence, fasta.sequence)
|
189
|
+
sequence_comp = match.group()
|
190
|
+
sequence = "".join([complementary[c] for c in sequence_comp[::-1]])
|
191
|
+
else:
|
192
|
+
match = re.search(sequence, fasta.sequence)
|
193
|
+
assert match is not None, (sequence, fasta.sequence)
|
194
|
+
sequence = match.group()
|
195
|
+
|
196
|
+
assert len(sequence) == len(structure)
|
197
|
+
|
198
|
+
structure = (
|
199
|
+
structure.replace(":", ".")
|
200
|
+
.replace("-", ".")
|
201
|
+
.replace("_", ".")
|
202
|
+
.replace(",", ".")
|
203
|
+
.replace("~", ".")
|
204
|
+
)
|
205
|
+
if set(structure) == {"."}:
|
206
|
+
continue
|
207
|
+
|
208
|
+
dot_bracket = DotBracket.from_string("N" * len(structure), structure)
|
209
|
+
structure = BpSeq.from_dotbracket(dot_bracket).dot_bracket.structure
|
210
|
+
result.append([sequence, structure])
|
211
|
+
|
212
|
+
if len(result) >= count:
|
213
|
+
break
|
214
|
+
|
215
|
+
return result
|
216
|
+
|
217
|
+
|
218
|
+
def generate_consensus_secondary_structure(
|
219
|
+
fasta: FASTA, family: str = None, fold: bool = True, count: int = 1
|
220
|
+
):
|
221
|
+
if shutil.which("cmpress") is None or shutil.which("cmsearch") is None:
|
222
|
+
raise RuntimeError(
|
223
|
+
"cmpress/cmsearch not found in PATH, please install Infernal first."
|
224
|
+
)
|
225
|
+
|
226
|
+
cm_path = ensure_cm(family)
|
227
|
+
|
228
|
+
with tempfile.NamedTemporaryFile(suffix=".fa") as fin:
|
229
|
+
fin.write(str(fasta).encode())
|
230
|
+
fin.seek(0)
|
231
|
+
|
232
|
+
completed = subprocess.run(
|
233
|
+
["cmsearch", "--notextw", cm_path, fin.name],
|
234
|
+
check=True,
|
235
|
+
capture_output=True,
|
236
|
+
)
|
237
|
+
|
238
|
+
results = analyze_cmsearch(completed.stdout.decode(), fasta, count)
|
239
|
+
|
240
|
+
if fold:
|
241
|
+
for i in range(len(results)):
|
242
|
+
RNAfold = RNA.fold_compound(results[i][0])
|
243
|
+
RNAfold.hc_add_from_db(results[i][1])
|
244
|
+
structure, _ = RNAfold.mfe()
|
245
|
+
results[i][1] = structure
|
246
|
+
|
247
|
+
return [
|
248
|
+
f">{fasta.header}\n{sequence}\n{structure}" for sequence, structure in results
|
249
|
+
]
|
250
|
+
|
251
|
+
|
252
|
+
def main():
|
253
|
+
parser = argparse.ArgumentParser(
|
254
|
+
description="Generate consensus secondary structure for a given sequence. IMPORTANT! You need to have Infernal software installed to use this script."
|
255
|
+
)
|
256
|
+
parser.add_argument(
|
257
|
+
"sequence",
|
258
|
+
type=str,
|
259
|
+
help="an RNA sequence or a path to FASTA file, possibly containing multiple sequences",
|
260
|
+
)
|
261
|
+
parser.add_argument(
|
262
|
+
"--family",
|
263
|
+
type=str,
|
264
|
+
help="(optional) name of the Rfam family to use, if not given, the whole Rfam will be checked for the given sequence",
|
265
|
+
)
|
266
|
+
parser.add_argument(
|
267
|
+
"--no-fold",
|
268
|
+
action="store_true",
|
269
|
+
help="(optional) whether to disable folding of the consensus secondary structure by RNAfold with constraints",
|
270
|
+
)
|
271
|
+
parser.add_argument(
|
272
|
+
"--count",
|
273
|
+
type=int,
|
274
|
+
default=1,
|
275
|
+
help="(optional) maximum number of consensus secondary structures to generate per sequence, default is 1",
|
276
|
+
)
|
277
|
+
|
278
|
+
args = parser.parse_args()
|
279
|
+
|
280
|
+
if os.path.exists(args.sequence):
|
281
|
+
fastas = parse_fasta(args.sequence)
|
282
|
+
else:
|
283
|
+
fastas = [FASTA("header", args.sequence)]
|
284
|
+
|
285
|
+
for fasta in fastas:
|
286
|
+
results = generate_consensus_secondary_structure(
|
287
|
+
fasta, args.family, not args.no_fold, args.count
|
288
|
+
)
|
289
|
+
for result in results:
|
290
|
+
print(result)
|
291
|
+
|
292
|
+
|
293
|
+
if __name__ == "__main__":
|
294
|
+
main()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|