RNApolis 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/METADATA +39 -1
- {RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/RECORD +7 -6
- {RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/WHEEL +1 -1
- {RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/entry_points.txt +1 -0
- rnapolis/rfam_folder.py +294 -0
- {RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/LICENSE +0 -0
- {RNApolis-0.2.1.dist-info → RNApolis-0.3.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: RNApolis
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: A Python library containing RNA-related bioinformatics functions and classes
|
5
5
|
Home-page: https://github.com/tzok/rnapolis-py
|
6
6
|
Author: Tomasz Zok
|
@@ -15,13 +15,16 @@ Classifier: Programming Language :: Python :: 3
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
16
16
|
Description-Content-Type: text/markdown
|
17
17
|
License-File: LICENSE
|
18
|
+
Requires-Dist: appdirs
|
18
19
|
Requires-Dist: graphviz
|
19
20
|
Requires-Dist: mmcif
|
20
21
|
Requires-Dist: numpy
|
21
22
|
Requires-Dist: ordered-set
|
22
23
|
Requires-Dist: orjson
|
23
24
|
Requires-Dist: pulp
|
25
|
+
Requires-Dist: requests
|
24
26
|
Requires-Dist: scipy
|
27
|
+
Requires-Dist: viennarna
|
25
28
|
|
26
29
|
# RNApolis
|
27
30
|
|
@@ -2133,3 +2136,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
|
|
2133
2136
|
- `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
|
2134
2137
|
|
2135
2138
|
For additional guidance, use `-h` or `--help`.
|
2139
|
+
|
2140
|
+
### `rfam-folder`
|
2141
|
+
|
2142
|
+
`rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
|
2143
|
+
|
2144
|
+
**Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
|
2145
|
+
|
2146
|
+
#### Usage:
|
2147
|
+
|
2148
|
+
The general usage pattern for rfam_folder.py is as follows:
|
2149
|
+
|
2150
|
+
```bash
|
2151
|
+
usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
|
2152
|
+
```
|
2153
|
+
|
2154
|
+
Positional Arguments:
|
2155
|
+
|
2156
|
+
- `sequence`: An RNA sequence given directly, or a path to a FASTA file containing one or more sequences.
|
2157
|
+
|
2158
|
+
Options:
|
2159
|
+
|
2160
|
+
- `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
|
2161
|
+
- `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
|
2162
|
+
- `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
|
2163
|
+
|
2164
|
+
#### Examples
|
2165
|
+
|
2166
|
+
Generate a consensus structure for a single RNA sequence, given a specific Rfam family:
|
2167
|
+
|
2168
|
+
```
|
2169
|
+
$ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
|
2170
|
+
>header
|
2171
|
+
AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
|
2172
|
+
..(((((..(((((.......)))))((....)).(((....)))....)))))....
|
2173
|
+
```
|
@@ -5,12 +5,13 @@ rnapolis/metareader.py,sha256=4qtMKRvww2sUStLeV8WVrLEt-ScydHUv4Gxx96tnf-M,1683
|
|
5
5
|
rnapolis/molecule_filter.py,sha256=NhjuqdCRnXgPefWZPeTq77tifmnAzamQtA0ODqPPG9k,6918
|
6
6
|
rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
|
7
7
|
rnapolis/parser.py,sha256=Z3Zd_IuRyOP45x5BStgu7UgoyHthhw55fT3udHUhAE4,11905
|
8
|
+
rnapolis/rfam_folder.py,sha256=MggwxechIE5f2K-p5nhwNqsL4ckuQw5bJQaFohC2u4c,8918
|
8
9
|
rnapolis/tertiary.py,sha256=iWMPD9c21rjMPpEdBd7mPCQgds65IbOr4_Fy06s0NoU,18957
|
9
10
|
rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
|
10
11
|
rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
|
11
|
-
RNApolis-0.
|
12
|
-
RNApolis-0.
|
13
|
-
RNApolis-0.
|
14
|
-
RNApolis-0.
|
15
|
-
RNApolis-0.
|
16
|
-
RNApolis-0.
|
12
|
+
RNApolis-0.3.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
13
|
+
RNApolis-0.3.1.dist-info/METADATA,sha256=quiuTEU3oKIvg6Mkpa4CN8-MBgUjjxzFjytjGtDd2eE,54300
|
14
|
+
RNApolis-0.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
15
|
+
RNApolis-0.3.1.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
|
16
|
+
RNApolis-0.3.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
17
|
+
RNApolis-0.3.1.dist-info/RECORD,,
|
rnapolis/rfam_folder.py
ADDED
@@ -0,0 +1,294 @@
|
|
1
|
+
#! /usr/bin/env python
|
2
|
+
import argparse
|
3
|
+
import gzip
|
4
|
+
import os
|
5
|
+
import re
|
6
|
+
import shutil
|
7
|
+
import subprocess
|
8
|
+
import tempfile
|
9
|
+
from typing import List
|
10
|
+
|
11
|
+
import appdirs
|
12
|
+
import requests
|
13
|
+
import RNA
|
14
|
+
|
15
|
+
from rnapolis.common import BpSeq, DotBracket
|
16
|
+
|
17
|
+
# URL of the gzip-compressed Rfam.cm file; ensure_cm() downloads it when no family is requested.
COMBINED_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz"
|
18
|
+
# URL of the Rfam.tar.gz archive; ensure_cm() downloads and unpacks it to look for "<family>.cm".
SEPARATE_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.tar.gz"
|
19
|
+
|
20
|
+
|
21
|
+
class FASTA:
    """
    A single FASTA record.

    Attributes:
        header (str): The header text; the ">" marker is added only when the
            record is rendered with str().
        sequence (str): The sequence text.
    """

    header: str
    sequence: str

    def __init__(self, header: str, sequence: str):
        self.header, self.sequence = header, sequence

    def __str__(self):
        """Render the record as a ">header" line followed by the sequence."""
        header_line = f">{self.header}"
        return f"{header_line}\n{self.sequence}"
|
31
|
+
|
32
|
+
|
33
|
+
def parse_fasta(fasta_path: str) -> List[FASTA]:
    """
    Read FASTA entries from a file.

    A record starts at every line whose first character is ">"; the rest of
    that line is the header. All following lines, up to the next header line,
    are concatenated into the sequence with surrounding whitespace removed.
    Anything that precedes the first header line is ignored.

    Args:
        fasta_path (str): The path to the FASTA file.

    Returns:
        List[FASTA]: A list of FASTA objects representing the entries in the file.
    """
    with open(fasta_path) as f:
        content = f.read()

    fastas: List[FASTA] = []
    header = None
    chunks: List[str] = []

    for line in content.splitlines():
        # Only a ">" in the first column opens a new record, so a ">" that
        # appears inside a header (e.g. "A>G") no longer splits the entry.
        if line.startswith(">"):
            if header is not None:
                fastas.append(FASTA(header, "".join(chunks)))
            header, chunks = line[1:], []
        elif header is not None:
            chunks.append(line.strip())

    if header is not None:
        fastas.append(FASTA(header, "".join(chunks)))

    return fastas
|
56
|
+
|
57
|
+
|
58
|
+
def _download_to(url: str, destination: str) -> None:
    """Stream ``url`` into ``destination``, creating it only after a complete, successful transfer."""
    partial = destination + ".part"

    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of caching an error page as the archive.
        response.raise_for_status()

        with open(partial, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # The final name appears only once the whole file is on disk, so an
    # interrupted download is retried on the next call instead of being reused.
    os.replace(partial, destination)


def ensure_cm(family: str = None):
    """
    Make sure a covariance model is available locally and return its path.

    Files are cached in the "rnapolis" user data directory reported by
    appdirs. Without a family, the combined Rfam.cm.gz is downloaded and
    decompressed to Rfam.cm. With a family, Rfam.tar.gz is downloaded and
    unpacked, and "<family>.cm" is expected to appear in that directory.
    Each step is skipped when its output file already exists. Finally the
    model is indexed with ``cmpress`` unless "<cm_path>.i1m" is present.

    Args:
        family (str): Name of the Rfam family, or None to use the combined model.

    Returns:
        str: The path to the covariance model file.

    Raises:
        RuntimeError: If "<family>.cm" is not present after unpacking the archive.
        requests.HTTPError: If a download is answered with an HTTP error status.
        subprocess.CalledProcessError: If ``cmpress`` exits with a non-zero status.
    """
    data_dir = appdirs.user_data_dir("rnapolis")
    os.makedirs(data_dir, exist_ok=True)

    if family is None:
        cm_gz_path = os.path.join(data_dir, "Rfam.cm.gz")
        cm_path = os.path.join(data_dir, "Rfam.cm")

        if not os.path.exists(cm_gz_path):
            _download_to(COMBINED_CM, cm_gz_path)

        if not os.path.exists(cm_path):
            # Decompress in chunks into a temporary name; a crash half-way
            # must not leave a truncated Rfam.cm that later runs would trust.
            partial = cm_path + ".part"
            with gzip.open(cm_gz_path, "rb") as f_in, open(partial, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.replace(partial, cm_path)
    else:
        cm_gz_path = os.path.join(data_dir, "Rfam.tar.gz")
        cm_path = os.path.join(data_dir, f"{family}.cm")

        if not os.path.exists(cm_gz_path):
            _download_to(SEPARATE_CM, cm_gz_path)

        if not os.path.exists(cm_path):
            # NOTE(review): the archive content is trusted as-is; confirm that
            # unpack_archive filters member paths on the supported Python versions.
            shutil.unpack_archive(cm_gz_path, data_dir)

        if not os.path.exists(cm_path):
            raise RuntimeError(
                f"Failed to find covariance model for {family} from Rfam."
            )

    if not os.path.exists(cm_path + ".i1m"):
        subprocess.run(["cmpress", cm_path], check=True, capture_output=True)

    return cm_path
|
97
|
+
|
98
|
+
|
99
|
+
def analyze_cmsearch(cmsearch: str, fasta: FASTA, count: int = 1):
    """
    Extract sequence/structure pairs from the text output of cmsearch.

    Every block of lines starting with ">>" is treated as one hit. Within a
    block, the line ending in " CS" supplies the structure and the line three
    below it supplies the aligned sequence; an optional line ending in " NC"
    marks columns to be unpaired. The aligned sequence is then located in
    ``fasta.sequence`` (compared in upper case) and the matched text is what
    gets reported. Hits whose structure contains no pairs are skipped.

    Args:
        cmsearch (str): The standard output of cmsearch.
        fasta (FASTA): The record that was searched.
        count (int): Stop after this many results have been collected.

    Returns:
        list: A list of ``[sequence, structure]`` lists, in output order.

    Raises:
        RuntimeError: If a hit block has no line ending in " CS".
        AssertionError: If a hit does not have the expected layout or its
            sequence cannot be found in ``fasta.sequence``.
    """
    # https://en.wikipedia.org/wiki/Nucleic_acid_notation
    complementary = {
        "A": "U",
        "C": "G",
        "G": "C",
        "U": "A",
        "W": "W",
        "S": "S",
        "M": "K",
        "K": "M",
        "R": "Y",
        "Y": "R",
        "B": "V",
        "D": "H",
        "H": "D",
        "V": "B",
        "N": "N",
        ".": ".",
    }
    # every non-bracket annotation character becomes an unpaired position
    to_unpaired = str.maketrans(":-_,~", ".....")

    result = []
    lines = cmsearch.splitlines()
    begins = [index for index, line in enumerate(lines) if line.startswith(">>")]
    ends = begins[1:] + [len(lines)]

    for begin, end in zip(begins, ends):
        nc_index, cs_index = None, None

        for k in range(begin, end):
            if lines[k].endswith(" NC"):
                nc_index = k
            if lines[k].endswith(" CS"):
                cs_index = k

        if cs_index is None:
            raise RuntimeError(
                f"No consensus structure (CS) line found for hit: {lines[begin]}"
            )

        assert len(lines[cs_index].split()) == 2

        structure = lines[cs_index]
        sequence = lines[cs_index + 3]

        coords = re.match(r"\s*.+?\s+(\d+)\s+.+\s+(\d+)", sequence)
        assert coords is not None, sequence
        first, last = int(coords.group(1)), int(coords.group(2))

        # the alignment occupies the columns between the leading spaces of
        # the CS line and its trailing " CS" label
        left = len(structure) - len(structure.lstrip(" "))
        right = structure.find(" CS")

        structure = structure[left:right]
        sequence = sequence[left:right].upper()

        # remove pairs which did not match to consensus
        if nc_index is not None:
            non_canonical = lines[nc_index][left:right]
            for flagged in re.finditer(r"[v?]", non_canonical):
                pos = flagged.start()
                structure = structure[:pos] + "." + structure[pos + 1 :]

        # replace *[n]* placeholders
        while True:
            placeholder = re.search(r"[<*]\[ *(\d+)\][*>]", sequence)

            if placeholder is None:
                break

            start, stop = placeholder.start(), placeholder.end()
            n = int(placeholder.group(1))
            sequence = sequence[:start] + "." * n + sequence[stop:]
            structure = structure[:start] + "." * n + structure[stop:]

        # replace gaps
        while True:
            gap = re.search(r"-+", sequence)

            if gap is None:
                break

            start, stop = gap.start(), gap.end()
            sequence = sequence[:start] + sequence[stop:]
            structure = structure[:start] + structure[stop:]

        assert len(sequence) == len(structure)

        # Compare in upper case: the alignment was upper-cased above, so a
        # lower-case input sequence would otherwise never be found.
        target = fasta.sequence.upper()

        if first > last:
            # NOTE(review): start > end presumably means a hit on the opposite
            # strand; the complemented, reversed sequence is what is searched.
            unknown = set(sequence) - set(complementary.keys())
            assert not unknown, (unknown, sequence)
            sequence_comp = "".join([complementary[c] for c in sequence[::-1]])
            # "." placeholders act as regex wildcards here
            found = re.search(sequence_comp, target)
            assert found is not None, (sequence, fasta.sequence)
            sequence_comp = found.group()
            sequence = "".join([complementary[c] for c in sequence_comp[::-1]])
        else:
            # "." placeholders act as regex wildcards here
            found = re.search(sequence, target)
            assert found is not None, (sequence, fasta.sequence)
            sequence = found.group()

        assert len(sequence) == len(structure)

        structure = structure.translate(to_unpaired)
        if set(structure) == {"."}:
            continue

        dot_bracket = DotBracket.from_string("N" * len(structure), structure)
        structure = BpSeq.from_dotbracket(dot_bracket).dot_bracket.structure
        result.append([sequence, structure])

        if len(result) >= count:
            break

    return result
|
216
|
+
|
217
|
+
|
218
|
+
def generate_consensus_secondary_structure(
    fasta: FASTA, family: str = None, fold: bool = True, count: int = 1
):
    """
    Run cmsearch for one FASTA record and format the resulting structures.

    Args:
        fasta (FASTA): The record to search with.
        family (str): Rfam family name passed to ensure_cm(), or None.
        fold (bool): When true, each structure found is replaced by the
            result of an MFE computation constrained by that structure.
        count (int): Maximum number of results, passed to analyze_cmsearch().

    Returns:
        list: One string per result, made of a ">header" line, the sequence
        line and the structure line.

    Raises:
        RuntimeError: If cmpress or cmsearch is not found in PATH.
    """
    required_tools = ("cmpress", "cmsearch")
    if any(shutil.which(tool) is None for tool in required_tools):
        raise RuntimeError(
            "cmpress/cmsearch not found in PATH, please install Infernal first."
        )

    cm_path = ensure_cm(family)

    with tempfile.NamedTemporaryFile(suffix=".fa") as fin:
        fin.write(str(fasta).encode())
        # rewinding also pushes the buffered bytes to disk before cmsearch opens the file
        fin.seek(0)

        command = ["cmsearch", "--notextw", cm_path, fin.name]
        completed = subprocess.run(command, check=True, capture_output=True)

    results = analyze_cmsearch(completed.stdout.decode(), fasta, count)

    if fold:
        for hit in results:
            compound = RNA.fold_compound(hit[0])
            compound.hc_add_from_db(hit[1])
            folded, _ = compound.mfe()
            hit[1] = folded

    formatted = []
    for sequence, structure in results:
        formatted.append(f">{fasta.header}\n{sequence}\n{structure}")
    return formatted
|
250
|
+
|
251
|
+
|
252
|
+
def main():
    """
    Command-line entry point.

    The positional argument is read as a FASTA file when a path of that name
    exists; otherwise it is used as a literal sequence with the header
    "header". Every result for every record is printed to standard output.
    """
    parser = argparse.ArgumentParser(
        description="Generate consensus secondary structure for a given sequence. IMPORTANT! You need to have Infernal software installed to use this script."
    )
    parser.add_argument(
        "sequence",
        type=str,
        help="an RNA sequence or a path to FASTA file, possibly containing multiple sequences",
    )
    parser.add_argument(
        "--family",
        type=str,
        help="(optional) name of the Rfam family to use, if not given, the whole Rfam will be checked for the given sequence",
    )
    parser.add_argument(
        "--no-fold",
        action="store_true",
        help="(optional) whether to disable folding of the consensus secondary structure by RNAfold with constraints",
    )
    parser.add_argument(
        "--count",
        type=int,
        default=1,
        help="(optional) maximum number of consensus secondary structures to generate per sequence, default is 1",
    )

    options = parser.parse_args()

    if not os.path.exists(options.sequence):
        records = [FASTA("header", options.sequence)]
    else:
        records = parse_fasta(options.sequence)

    fold = not options.no_fold

    for record in records:
        entries = generate_consensus_secondary_structure(
            record, options.family, fold, options.count
        )
        for entry in entries:
            print(entry)
|
291
|
+
|
292
|
+
|
293
|
+
# Invoke main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
|
294
|
+
main()
|
File without changes
|
File without changes
|