RNApolis 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -15,13 +15,16 @@ Classifier: Programming Language :: Python :: 3
15
15
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: appdirs
18
19
  Requires-Dist: graphviz
19
20
  Requires-Dist: mmcif
20
21
  Requires-Dist: numpy
21
22
  Requires-Dist: ordered-set
22
23
  Requires-Dist: orjson
23
24
  Requires-Dist: pulp
25
+ Requires-Dist: requests
24
26
  Requires-Dist: scipy
27
+ Requires-Dist: viennarna
25
28
 
26
29
  # RNApolis
27
30
 
@@ -2133,3 +2136,38 @@ To use `transformer.py`, specify the path to your input mmCIF file and the desir
2133
2136
  - `--copy-to COPY_TO`: Indicate the column name to copy data to (e.g., `auth_asym_id`).
2134
2137
 
2135
2138
  For additional guidance, use `-h` or `--help`.
2139
+
2140
+ ### `rfam-folder`
2141
+
2142
+ `rfam-folder` is a command-line tool for generating consensus secondary structures for RNA sequences. This tool can process a single RNA sequence or multiple sequences from a FASTA file. It offers the flexibility to specify an Rfam family for targeted analysis, control the folding of the secondary structure, and set a limit on the number of structures generated.
2143
+
2144
+ **Important!** You need to have [Infernal software](http://eddylab.org/infernal/) installed for this script to run.
2145
+
2146
+ #### Usage:
2147
+
2148
+ The general usage pattern for rfam_folder.py is as follows:
2149
+
2150
+ ```bash
2151
+ usage: rfam_folder.py [-h] [--family FAMILY] [--no-fold] [--count COUNT] sequence
2152
+ ```
2153
+
2154
+ Positional Arguments:
2155
+
2156
+ - `sequence`: An RNA sequence given directly, or a path to a FASTA file containing one or more sequences.
2157
+
2158
+ Options:
2159
+
2160
+ - `--family FAMILY`: (Optional) Specify the name of the Rfam family to use. If not given, the entire Rfam database is searched for the sequence.
2161
+ - `--no-fold`: (Optional) Disable folding of the consensus secondary structure by RNAfold with constraints.
2162
+ - `--count COUNT`: (Optional) Set the maximum number of consensus secondary structures to generate per sequence, with a default of 1.
2163
+
2164
+ #### Examples
2165
+
2166
+ Generate a consensus structure for a single RNA sequence, given a specific Rfam family:
2167
+
2168
+ ```
2169
+ $ rfam-folder AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU --family RF01739
2170
+ >header
2171
+ AUCGUUCACCUCAUAAAUUGAGUGAGACGGAAGUAGGUUAAAACCGAAGGAACGCAGU
2172
+ ..(((((..(((((.......)))))((....)).(((....)))....)))))....
2173
+ ```
@@ -5,12 +5,13 @@ rnapolis/metareader.py,sha256=4qtMKRvww2sUStLeV8WVrLEt-ScydHUv4Gxx96tnf-M,1683
5
5
  rnapolis/molecule_filter.py,sha256=NhjuqdCRnXgPefWZPeTq77tifmnAzamQtA0ODqPPG9k,6918
6
6
  rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
7
7
  rnapolis/parser.py,sha256=Z3Zd_IuRyOP45x5BStgu7UgoyHthhw55fT3udHUhAE4,11905
8
+ rnapolis/rfam_folder.py,sha256=MggwxechIE5f2K-p5nhwNqsL4ckuQw5bJQaFohC2u4c,8918
8
9
  rnapolis/tertiary.py,sha256=iWMPD9c21rjMPpEdBd7mPCQgds65IbOr4_Fy06s0NoU,18957
9
10
  rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
10
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
11
- RNApolis-0.2.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
12
- RNApolis-0.2.1.dist-info/METADATA,sha256=PQMiXyedtnIVHGawMug4hssMACTxbHBQjtW7_NrxbpQ,52712
13
- RNApolis-0.2.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
14
- RNApolis-0.2.1.dist-info/entry_points.txt,sha256=oI0ywRPjBQJBR_k4MIQIqwsy5MZu6D5dkj_rfQNZTV4,268
15
- RNApolis-0.2.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
16
- RNApolis-0.2.1.dist-info/RECORD,,
12
+ RNApolis-0.3.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.3.1.dist-info/METADATA,sha256=quiuTEU3oKIvg6Mkpa4CN8-MBgUjjxzFjytjGtDd2eE,54300
14
+ RNApolis-0.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
15
+ RNApolis-0.3.1.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.3.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.3.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.42.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -4,4 +4,5 @@ clashfinder = rnapolis.clashfinder:main
4
4
  metareader = rnapolis.metareader:main
5
5
  molecule-filter = rnapolis.molecule_filter:main
6
6
  motif-extractor = rnapolis.motif_extractor:main
7
+ rfam-folder = rnapolis.rfam_folder:main
7
8
  transformer = rnapolis.transformer:main
@@ -0,0 +1,294 @@
1
+ #! /usr/bin/env python
2
+ import argparse
3
+ import gzip
4
+ import os
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import tempfile
9
+ from typing import List
10
+
11
+ import appdirs
12
+ import requests
13
+ import RNA
14
+
15
+ from rnapolis.common import BpSeq, DotBracket
16
+
17
# Gzip-compressed file downloaded by ensure_cm() when no Rfam family is given.
COMBINED_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz"
# Tar archive downloaded and unpacked by ensure_cm() when a family is given;
# ensure_cm() expects to find "<family>.cm" after unpacking it.
SEPARATE_CM = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.tar.gz"
19
+
20
+
21
class FASTA:
    """A single FASTA record: the header (without the leading ``>``) and its sequence."""

    header: str
    sequence: str

    def __init__(self, header: str, sequence: str):
        self.header, self.sequence = header, sequence

    def __str__(self):
        # Two-line FASTA rendering: ">header" followed by the sequence.
        return ">{}\n{}".format(self.header, self.sequence)
31
+
32
+
33
def parse_fasta(fasta_path: str) -> List[FASTA]:
    """
    Read FASTA entries from a file.

    A record starts at a line whose first character is ``>``; the rest of that
    line is the header. Every following line up to the next header is
    concatenated into the sequence, with surrounding whitespace removed from
    each line. Anything before the first header line is ignored.

    Args:
        fasta_path (str): The path to the FASTA file.

    Returns:
        List[FASTA]: A list of FASTA objects representing the entries in the file.
    """
    with open(fasta_path) as f:
        content = f.read()

    fastas = []
    header = None
    chunks = []

    for line in content.splitlines():
        if line.startswith(">"):
            # Only a ">" at the start of a line opens a record, so headers
            # that contain ">" themselves are kept intact.
            if header is not None:
                fastas.append(FASTA(header, "".join(chunks)))
            header = line[1:]
            chunks = []
        elif header is not None:
            # Stray spaces/tabs in sequence lines would otherwise end up
            # inside the sequence.
            chunks.append(line.strip())

    if header is not None:
        fastas.append(FASTA(header, "".join(chunks)))

    return fastas
56
+
57
+
58
def _download_file(url: str, path: str):
    """Stream ``url`` into ``path``, only creating ``path`` once the download succeeded."""
    partial_path = path + ".part"

    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly instead of caching an HTTP error page as the archive.
        response.raise_for_status()

        with open(partial_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Rename at the very end so an interrupted download is never mistaken
    # for a complete file by the os.path.exists() checks of later runs.
    os.replace(partial_path, path)


def ensure_cm(family: str = None):
    """
    Make sure a pressed covariance model is available locally and return its path.

    Files are kept in ``appdirs.user_data_dir("rnapolis")``. Without ``family``
    the combined ``Rfam.cm.gz`` is downloaded and decompressed to ``Rfam.cm``.
    With ``family`` the ``Rfam.tar.gz`` archive is downloaded and unpacked, and
    ``<family>.cm`` is expected to be among the unpacked files. Steps whose
    output file already exists are skipped. Finally ``cmpress`` is run on the
    model unless the ``.i1m`` file is already present next to it.

    Args:
        family (str): Name of the Rfam family, or None to use the combined model.

    Returns:
        str: Path to the covariance model file.

    Raises:
        RuntimeError: If ``<family>.cm`` does not exist after unpacking.
        requests.HTTPError: If the download returns an error status.
        subprocess.CalledProcessError: If ``cmpress`` fails.
    """
    data_dir = appdirs.user_data_dir("rnapolis")
    os.makedirs(data_dir, exist_ok=True)

    if family is None:
        cm_gz_path = os.path.join(data_dir, "Rfam.cm.gz")
        cm_path = os.path.join(data_dir, "Rfam.cm")

        if not os.path.exists(cm_gz_path):
            _download_file(COMBINED_CM, cm_gz_path)

        if not os.path.exists(cm_path):
            # Decompress in chunks into a temporary name, then rename, so that
            # neither memory usage nor a crash mid-way becomes a problem.
            partial_path = cm_path + ".part"
            with gzip.open(cm_gz_path, "rb") as f_in, open(partial_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.replace(partial_path, cm_path)
    else:
        cm_gz_path = os.path.join(data_dir, "Rfam.tar.gz")
        cm_path = os.path.join(data_dir, f"{family}.cm")

        if not os.path.exists(cm_gz_path):
            _download_file(SEPARATE_CM, cm_gz_path)

        if not os.path.exists(cm_path):
            shutil.unpack_archive(cm_gz_path, data_dir)

        if not os.path.exists(cm_path):
            raise RuntimeError(
                f"Failed to find covariance model for {family} from Rfam."
            )

    if not os.path.exists(cm_path + ".i1m"):
        subprocess.run(["cmpress", cm_path], check=True, capture_output=True)

    return cm_path
97
+
98
+
99
def analyze_cmsearch(cmsearch: str, fasta: FASTA, count: int = 1):
    """
    Extract aligned sequences and their consensus structures from cmsearch output.

    The output is split into hits at lines starting with ``>>``. Within a hit,
    the line ending with `` CS`` is taken as the consensus structure and the
    line three rows below it as the aligned sequence (with its first and last
    coordinate). Positions flagged with ``v`` or ``?`` on the `` NC`` line are
    unpaired, ``*[n]*``/``<[n]>`` placeholders are expanded to ``n`` unknown
    positions, and gap columns are dropped. The resulting pattern is then
    located in ``fasta.sequence`` (as the reverse complement when the first
    coordinate is greater than the last one) to recover the actual residues.
    Hits whose structure contains no base pair are skipped.

    Args:
        cmsearch (str): Text output of ``cmsearch``.
        fasta (FASTA): The searched entry; its sequence is used to resolve the
            aligned fragment.
        count (int): Stop after this many results have been collected.

    Returns:
        list: Two-element lists ``[sequence, structure]``, in the order of hits.

    Raises:
        RuntimeError: If a hit cannot be parsed or its sequence cannot be
            found in ``fasta.sequence``.
    """
    # https://en.wikipedia.org/wiki/Nucleic_acid_notation
    complementary = {
        "A": "U",
        "C": "G",
        "G": "C",
        "U": "A",
        "W": "W",
        "S": "S",
        "M": "K",
        "K": "M",
        "R": "Y",
        "Y": "R",
        "B": "V",
        "D": "H",
        "H": "D",
        "V": "B",
        "N": "N",
        ".": ".",
    }
    # every non-bracket annotation character of the CS line becomes unpaired
    to_unpaired = str.maketrans(":-_,~", ".....")

    result = []
    lines = cmsearch.splitlines()
    begins = [index for index, line in enumerate(lines) if line.startswith(">>")]
    ends = begins[1:] + [len(lines)]

    for begin, end in zip(begins, ends):
        nc_index, cs_index = None, None

        for index in range(begin, end):
            if lines[index].endswith(" NC"):
                nc_index = index
            if lines[index].endswith(" CS"):
                cs_index = index

        if cs_index is None:
            raise RuntimeError(
                f"Failed to find CS line in cmsearch output for hit: {lines[begin]}"
            )
        if len(lines[cs_index].split()) != 2:
            raise RuntimeError(
                f"Unexpected CS line in cmsearch output: {lines[cs_index]}"
            )
        if cs_index + 3 >= len(lines):
            raise RuntimeError(
                f"Failed to find sequence line in cmsearch output for hit: {lines[begin]}"
            )

        structure = lines[cs_index]
        sequence = lines[cs_index + 3]

        match = re.match(r"\s*.+?\s+(\d+)\s+.+\s+(\d+)", sequence)
        if match is None:
            raise RuntimeError(
                f"Unexpected sequence line in cmsearch output: {sequence}"
            )
        first, last = int(match.group(1)), int(match.group(2))

        # the alignment occupies the same columns in the CS, NC and sequence rows
        left = len(structure) - len(structure.lstrip(" "))
        right = structure.find(" CS")

        structure = structure[left:right]
        sequence = sequence[left:right].upper()

        # remove pairs which did not match to consensus
        if nc_index is not None:
            non_canonical = lines[nc_index][left:right]
            for mark in re.finditer(r"[v?]", non_canonical):
                position = mark.start()
                structure = structure[:position] + "." + structure[position + 1 :]

        # replace *[n]* placeholders
        while True:
            match = re.search(r"[<*]\[ *(\d+)\][*>]", sequence)

            if match is None:
                break

            start, stop = match.start(), match.end()
            n = int(match.group(1))
            # "." doubles as a regex wildcard when the sequence is searched below
            sequence = sequence[:start] + "." * n + sequence[stop:]
            structure = structure[:start] + "." * n + structure[stop:]

        # replace gaps
        while True:
            match = re.search(r"-+", sequence)

            if match is None:
                break

            start, stop = match.start(), match.end()
            sequence = sequence[:start] + sequence[stop:]
            structure = structure[:start] + structure[stop:]

        if len(sequence) != len(structure):
            raise RuntimeError(
                f"Sequence and structure differ in length: {sequence} {structure}"
            )

        if first > last:
            unknown = set(sequence) - set(complementary)
            if unknown:
                raise RuntimeError(
                    f"Unexpected characters {sorted(unknown)} in sequence: {sequence}"
                )
            sequence_comp = "".join(complementary[c] for c in reversed(sequence))
            match = re.search(sequence_comp, fasta.sequence)
            if match is None:
                raise RuntimeError(
                    f"Failed to find {sequence} in sequence: {fasta.sequence}"
                )
            sequence_comp = match.group()
            sequence = "".join(complementary[c] for c in reversed(sequence_comp))
        else:
            match = re.search(sequence, fasta.sequence)
            if match is None:
                raise RuntimeError(
                    f"Failed to find {sequence} in sequence: {fasta.sequence}"
                )
            sequence = match.group()

        if len(sequence) != len(structure):
            raise RuntimeError(
                f"Sequence and structure differ in length: {sequence} {structure}"
            )

        structure = structure.translate(to_unpaired)
        if set(structure) == {"."}:
            continue

        dot_bracket = DotBracket.from_string("N" * len(structure), structure)
        structure = BpSeq.from_dotbracket(dot_bracket).dot_bracket.structure
        result.append([sequence, structure])

        if len(result) >= count:
            break

    return result
216
+
217
+
218
def generate_consensus_secondary_structure(
    fasta: FASTA, family: str = None, fold: bool = True, count: int = 1
):
    """
    Run cmsearch for one FASTA entry and format the resulting structures.

    Args:
        fasta (FASTA): The entry to search with.
        family (str): Passed to ensure_cm() to select the covariance model.
        fold (bool): If True, each structure returned by analyze_cmsearch() is
            added as a hard constraint to an RNA fold compound and replaced by
            the structure returned from its mfe() call.
        count (int): Passed to analyze_cmsearch() as the limit of results.

    Returns:
        list: Strings in the form ">header\\nsequence\\nstructure".

    Raises:
        RuntimeError: If cmpress or cmsearch is not found in PATH.
    """
    tool_missing = (
        shutil.which("cmpress") is None or shutil.which("cmsearch") is None
    )
    if tool_missing:
        raise RuntimeError(
            "cmpress/cmsearch not found in PATH, please install Infernal first."
        )

    cm_path = ensure_cm(family)

    with tempfile.NamedTemporaryFile(suffix=".fa") as fin:
        fin.write(str(fasta).encode())
        fin.seek(0)

        command = ["cmsearch", "--notextw", cm_path, fin.name]
        completed = subprocess.run(command, check=True, capture_output=True)

    results = analyze_cmsearch(completed.stdout.decode(), fasta, count)

    if fold:
        # entries are [sequence, structure] lists, updated in place
        for entry in results:
            compound = RNA.fold_compound(entry[0])
            compound.hc_add_from_db(entry[1])
            structure, _ = compound.mfe()
            entry[1] = structure

    formatted = []
    for sequence, structure in results:
        formatted.append(f">{fasta.header}\n{sequence}\n{structure}")
    return formatted
250
+
251
+
252
def main():
    """
    Command-line entry point.

    Parses the arguments, reads the input as a FASTA file when the given value
    is an existing path (otherwise treats it as a raw sequence with the header
    "header"), and prints every result produced for each entry.
    """
    parser = argparse.ArgumentParser(
        description="Generate consensus secondary structure for a given sequence. IMPORTANT! You need to have Infernal software installed to use this script."
    )

    # (flags, keyword arguments) in the order the options are registered
    options = (
        (
            ("sequence",),
            {
                "type": str,
                "help": "an RNA sequence or a path to FASTA file, possibly containing multiple sequences",
            },
        ),
        (
            ("--family",),
            {
                "type": str,
                "help": "(optional) name of the Rfam family to use, if not given, the whole Rfam will be checked for the given sequence",
            },
        ),
        (
            ("--no-fold",),
            {
                "action": "store_true",
                "help": "(optional) whether to disable folding of the consensus secondary structure by RNAfold with constraints",
            },
        ),
        (
            ("--count",),
            {
                "type": int,
                "default": 1,
                "help": "(optional) maximum number of consensus secondary structures to generate per sequence, default is 1",
            },
        ),
    )
    for flags, settings in options:
        parser.add_argument(*flags, **settings)

    args = parser.parse_args()

    if not os.path.exists(args.sequence):
        fastas = [FASTA("header", args.sequence)]
    else:
        fastas = parse_fasta(args.sequence)

    fold = not args.no_fold

    for fasta in fastas:
        for result in generate_consensus_secondary_structure(
            fasta, args.family, fold, args.count
        ):
            print(result)


if __name__ == "__main__":
    main()