RNApolis 0.3.18__tar.gz → 0.4.1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (31) hide show
  1. {rnapolis-0.3.18/src/RNApolis.egg-info → rnapolis-0.4.1}/PKG-INFO +1 -1
  2. {rnapolis-0.3.18 → rnapolis-0.4.1}/setup.py +1 -1
  3. {rnapolis-0.3.18 → rnapolis-0.4.1/src/RNApolis.egg-info}/PKG-INFO +1 -1
  4. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/parser.py +82 -32
  5. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/tertiary.py +42 -5
  6. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_annotator.py +1 -3
  7. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_bugfixes.py +13 -2
  8. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_common.py +1 -2
  9. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_rfam_folder.py +0 -1
  10. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_tertiary.py +4 -4
  11. {rnapolis-0.3.18 → rnapolis-0.4.1}/LICENSE +0 -0
  12. {rnapolis-0.3.18 → rnapolis-0.4.1}/README.md +0 -0
  13. {rnapolis-0.3.18 → rnapolis-0.4.1}/pyproject.toml +0 -0
  14. {rnapolis-0.3.18 → rnapolis-0.4.1}/setup.cfg +0 -0
  15. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/RNApolis.egg-info/SOURCES.txt +0 -0
  16. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/RNApolis.egg-info/dependency_links.txt +0 -0
  17. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/RNApolis.egg-info/entry_points.txt +0 -0
  18. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/RNApolis.egg-info/requires.txt +0 -0
  19. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/RNApolis.egg-info/top_level.txt +0 -0
  20. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/annotator.py +0 -0
  21. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/clashfinder.py +0 -0
  22. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/common.py +0 -0
  23. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/metareader.py +0 -0
  24. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/molecule_filter.py +0 -0
  25. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/motif_extractor.py +0 -0
  26. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/rfam_folder.py +0 -0
  27. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/transformer.py +0 -0
  28. {rnapolis-0.3.18 → rnapolis-0.4.1}/src/rnapolis/util.py +0 -0
  29. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_metareader.py +0 -0
  30. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_parser.py +0 -0
  31. {rnapolis-0.3.18 → rnapolis-0.4.1}/tests/test_quadruplexes.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.3.18
3
+ Version: 0.4.1
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -5,7 +5,7 @@ with open("README.md") as f:
5
5
 
6
6
  setup(
7
7
  name="RNApolis",
8
- version="0.3.18",
8
+ version="0.4.1",
9
9
  packages=["rnapolis"],
10
10
  package_dir={"": "src"},
11
11
  author="Tomasz Zok",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.3.18
3
+ Version: 0.4.1
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
12
12
  def read_3d_structure(
13
13
  cif_or_pdb: IO[str], model: Optional[int] = None, nucleic_acid_only: bool = False
14
14
  ) -> Structure3D:
15
- atoms, modified, sequence = (
15
+ atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity = (
16
16
  parse_cif(cif_or_pdb) if is_cif(cif_or_pdb) else parse_pdb(cif_or_pdb)
17
17
  )
18
18
  available_models = {atom.model: None for atom in atoms}
@@ -24,7 +24,13 @@ def read_3d_structure(
24
24
  atoms = atoms_by_model[model]
25
25
  else:
26
26
  atoms = atoms_by_model[list(available_models.keys())[0]]
27
- return group_atoms(atoms, modified, sequence, nucleic_acid_only)
27
+ return group_atoms(
28
+ atoms,
29
+ modified,
30
+ sequence_by_entity,
31
+ is_nucleic_acid_by_entity,
32
+ nucleic_acid_only,
33
+ )
28
34
 
29
35
 
30
36
  def is_cif(cif_or_pdb: IO[str]) -> bool:
@@ -40,7 +46,8 @@ def parse_cif(
40
46
  ) -> Tuple[
41
47
  List[Atom],
42
48
  Dict[Union[ResidueLabel, ResidueAuth], str],
43
- Dict[Tuple[str, int], str],
49
+ Dict[str, str],
50
+ Dict[str, bool],
44
51
  ]:
45
52
  cif.seek(0)
46
53
 
@@ -48,7 +55,8 @@ def parse_cif(
48
55
  data = io_adapter.readFile(cif.name)
49
56
  atoms: List[Atom] = []
50
57
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
51
- sequence = {}
58
+ sequence_by_entity = {}
59
+ is_nucleic_acid_by_entity = {}
52
60
 
53
61
  if data:
54
62
  atom_site = data[0].getObj("atom_site")
@@ -59,6 +67,7 @@ def parse_cif(
59
67
  for row in atom_site.getRowList():
60
68
  row_dict = dict(zip(atom_site.getAttributeList(), row))
61
69
 
70
+ label_entity_id = row_dict.get("label_entity_id", None)
62
71
  label_chain_name = row_dict.get("label_asym_id", None)
63
72
  label_residue_number = try_parse_int(row_dict.get("label_seq_id", None))
64
73
  label_residue_name = row_dict.get("label_comp_id", None)
@@ -127,7 +136,19 @@ def parse_cif(
127
136
  else None
128
137
  )
129
138
 
130
- atoms.append(Atom(label, auth, model, atom_name, x, y, z, occupancy))
139
+ atoms.append(
140
+ Atom(
141
+ label_entity_id,
142
+ label,
143
+ auth,
144
+ model,
145
+ atom_name,
146
+ x,
147
+ y,
148
+ z,
149
+ occupancy,
150
+ )
151
+ )
131
152
 
132
153
  if mod_residue:
133
154
  for row in mod_residue.getRowList():
@@ -178,17 +199,24 @@ def parse_cif(
178
199
  for row in entity_poly.getRowList():
179
200
  row_dict = dict(zip(entity_poly.getAttributeList(), row))
180
201
 
181
- pdbx_strand_id = row_dict.get("pdbx_strand_id", None)
202
+ entity_id = row_dict.get("entity_id", None)
203
+ type_ = row_dict.get("type", None)
182
204
  pdbx_seq_one_letter_code_can = row_dict.get(
183
205
  "pdbx_seq_one_letter_code_can", None
184
206
  )
185
207
 
186
- if pdbx_strand_id and pdbx_seq_one_letter_code_can:
187
- for strand in pdbx_strand_id.split(","):
188
- for i, letter in enumerate(pdbx_seq_one_letter_code_can):
189
- sequence[(strand, i + 1)] = letter
208
+ if entity_id and type_:
209
+ is_nucleic_acid_by_entity[entity_id] = type_ in (
210
+ "peptide nucleic acid",
211
+ "polydeoxyribonucleotide",
212
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
213
+ "polyribonucleotide",
214
+ )
215
+
216
+ if entity_id and pdbx_seq_one_letter_code_can:
217
+ sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
190
218
 
191
- return atoms, modified, sequence
219
+ return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
192
220
 
193
221
 
194
222
  def parse_pdb(
@@ -196,7 +224,8 @@ def parse_pdb(
196
224
  ) -> Tuple[
197
225
  List[Atom],
198
226
  Dict[Union[ResidueLabel, ResidueAuth], str],
199
- Dict[Tuple[str, int], str],
227
+ Dict[str, str],
228
+ Dict[str, bool],
200
229
  ]:
201
230
  pdb.seek(0)
202
231
  atoms: List[Atom] = []
@@ -222,7 +251,7 @@ def parse_pdb(
222
251
  auth = ResidueAuth(
223
252
  chain_identifier, residue_number, insertion_code, residue_name
224
253
  )
225
- atoms.append(Atom(None, auth, model, atom_name, x, y, z, occupancy))
254
+ atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
226
255
  elif line.startswith("MODRES"):
227
256
  original_name = line[12:15]
228
257
  chain_identifier = line[16]
@@ -234,13 +263,14 @@ def parse_pdb(
234
263
  )
235
264
  modified[auth] = standard_residue_name
236
265
 
237
- return atoms, modified, {}
266
+ return atoms, modified, {}, {}
238
267
 
239
268
 
240
269
  def group_atoms(
241
270
  atoms: List[Atom],
242
271
  modified: Dict[Union[ResidueLabel, ResidueAuth], str],
243
- sequence: Dict[Tuple[str, int], str],
272
+ sequence_by_entity: Dict[str, str],
273
+ is_nucleic_acid_by_entity: Dict[str, bool],
244
274
  nucleic_acid_only: bool,
245
275
  ) -> Structure3D:
246
276
  if not atoms:
@@ -258,28 +288,45 @@ def group_atoms(
258
288
  label = key_previous[0]
259
289
  auth = key_previous[1]
260
290
  model = key_previous[2]
291
+ entity_id = residue_atoms[-1].entity_id
261
292
  name = get_residue_name(auth, label, modified)
262
- one_letter_name = get_one_letter_name(label, sequence, name)
263
- if one_letter_name not in "ACGUT":
293
+ one_letter_name = get_one_letter_name(
294
+ entity_id, label, sequence_by_entity, name
295
+ )
296
+
297
+ if one_letter_name not in "ACGUTN":
264
298
  one_letter_name = detect_one_letter_name(residue_atoms)
265
- residue = Residue3D(
266
- label, auth, model, one_letter_name, tuple(residue_atoms)
299
+
300
+ residues.append(
301
+ Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
267
302
  )
268
- if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
269
- residues.append(residue)
303
+
270
304
  key_previous = key
271
305
  residue_atoms = [atom]
272
306
 
273
307
  label = key_previous[0]
274
308
  auth = key_previous[1]
275
309
  model = key_previous[2]
310
+ entity_id = residue_atoms[-1].entity_id
276
311
  name = get_residue_name(auth, label, modified)
277
- one_letter_name = get_one_letter_name(label, sequence, name)
278
- if one_letter_name not in "ACGUT":
312
+ one_letter_name = get_one_letter_name(entity_id, label, sequence_by_entity, name)
313
+
314
+ if one_letter_name not in "ACGUTN":
279
315
  one_letter_name = detect_one_letter_name(residue_atoms)
280
- residue = Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
281
- if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
282
- residues.append(residue)
316
+
317
+ residues.append(
318
+ Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
319
+ )
320
+
321
+ if nucleic_acid_only:
322
+ if is_nucleic_acid_by_entity:
323
+ residues = [
324
+ residue
325
+ for residue in residues
326
+ if is_nucleic_acid_by_entity[residue.atoms[0].entity_id]
327
+ ]
328
+ else:
329
+ residues = [residue for residue in residues if residue.is_nucleotide]
283
330
 
284
331
  return Structure3D(residues)
285
332
 
@@ -304,13 +351,14 @@ def get_residue_name(
304
351
 
305
352
 
306
353
  def get_one_letter_name(
307
- label: Optional[ResidueLabel], sequence: Dict[Tuple[str, int], str], name: str
354
+ entity_id: Optional[str],
355
+ label: Optional[ResidueLabel],
356
+ sequence_by_entity: Dict[str, str],
357
+ name: str,
308
358
  ) -> str:
309
359
  # try getting the value from _entity_poly first
310
- if label is not None:
311
- key = (label.chain, label.number)
312
- if key in sequence:
313
- return sequence[key]
360
+ if entity_id is not None and label is not None and entity_id in sequence_by_entity:
361
+ return sequence_by_entity[entity_id][label.number - 1]
314
362
  # RNA
315
363
  if len(name) == 1:
316
364
  return name
@@ -334,11 +382,13 @@ def detect_one_letter_name(atoms: List[Atom]) -> str:
334
382
  ) / len(atom_names_expected)
335
383
  score[candidate] = count
336
384
  items = sorted(score.items(), key=lambda kv: kv[1], reverse=True)
385
+ if items[0][1] == 0:
386
+ return "?"
337
387
  return items[0][0]
338
388
 
339
389
 
340
390
  def try_parse_int(s: str) -> Optional[int]:
341
391
  try:
342
392
  return int(s)
343
- except:
393
+ except ValueError:
344
394
  return None
@@ -96,6 +96,7 @@ AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
96
96
 
97
97
  @dataclass(frozen=True, order=True)
98
98
  class Atom:
99
+ entity_id: Optional[str]
99
100
  label: Optional[ResidueLabel]
100
101
  auth: Optional[ResidueAuth]
101
102
  model: int
@@ -128,6 +129,29 @@ class Residue3D(Residue):
128
129
  "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
129
130
  "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
130
131
  }
132
+ # Heavy atoms in nucleotide
133
+ nucleotide_heavy_atoms = (
134
+ set(
135
+ [
136
+ "P",
137
+ "OP1",
138
+ "OP2",
139
+ "O5'",
140
+ "C5'",
141
+ "C4'",
142
+ "O4'",
143
+ "C3'",
144
+ "O3'",
145
+ "C2'",
146
+ "O2'",
147
+ "C1'",
148
+ ]
149
+ )
150
+ .union(nucleobase_heavy_atoms["A"])
151
+ .union(nucleobase_heavy_atoms["G"])
152
+ .union(nucleobase_heavy_atoms["C"])
153
+ .union(nucleobase_heavy_atoms["U"])
154
+ )
131
155
 
132
156
  def __lt__(self, other):
133
157
  return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -176,8 +200,8 @@ class Residue3D(Residue):
176
200
 
177
201
  @cached_property
178
202
  def is_nucleotide(self) -> bool:
179
- return len(self.atoms) > 1 and any(
180
- [atom for atom in self.atoms if atom.name == "C1'"]
203
+ return self.nucleotide_heavy_atoms.intersection(
204
+ set([atom.name for atom in self.atoms])
181
205
  )
182
206
 
183
207
  @cached_property
@@ -268,7 +292,7 @@ class Residue3D(Residue):
268
292
  logging.error(
269
293
  f"Failed to determine the outermost atom for nucleotide {self}, so an arbitrary atom will be used"
270
294
  )
271
- yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
295
+ yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
272
296
 
273
297
  def __inner_generator(self):
274
298
  # try to find expected atom name
@@ -296,7 +320,7 @@ class Residue3D(Residue):
296
320
  logging.error(
297
321
  f"Failed to determine the innermost atom for nucleotide {self}, so an arbitrary atom will be used"
298
322
  )
299
- yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
323
+ yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
300
324
 
301
325
 
302
326
  @dataclass(frozen=True, order=True)
@@ -524,7 +548,20 @@ class Mapping2D3D:
524
548
  result: Dict[int, List] = {}
525
549
  residue_map: Dict[Residue3D, int] = {}
526
550
  i = 1
527
- for residue in self.structure3d.residues:
551
+
552
+ for j, residue in enumerate(self.structure3d.residues):
553
+ if self.find_gaps and j > 0:
554
+ previous = self.structure3d.residues[j - 1]
555
+ if (
556
+ previous.is_nucleotide
557
+ and residue.is_nucleotide
558
+ and previous.label
559
+ and residue.label
560
+ and previous.label.chain == residue.label.chain
561
+ ):
562
+ for k in range(residue.label.number - previous.label.number - 1):
563
+ result[i] = [i, "?", 0]
564
+ i += 1
528
565
  if residue.is_nucleotide:
529
566
  result[i] = [i, residue.one_letter_name, 0]
530
567
  residue_map[residue] = i
@@ -36,9 +36,7 @@ def test_1ehz():
36
36
  for bp in interactions[i]
37
37
  if (bp.nt1.full_name, bp.nt2.full_name) == element
38
38
  ]
39
- assert (
40
- False
41
- ), f"Interaction {element} occurs {count} times among {labels[i]} type: {duplicates}"
39
+ assert False, f"Interaction {element} occurs {count} times among {labels[i]} type: {duplicates}"
42
40
 
43
41
 
44
42
  def test_8btk():
@@ -34,7 +34,7 @@ def test_1DFU():
34
34
  assert b1u not in mapping.base_pair_graph[b2g]
35
35
 
36
36
 
37
- # in 4WTI the first residue has only O3' atom and so is not considered a nucleotide
37
+ # in 4WTI the first residue has only O3' atom, but is still considered a nucleotide
38
38
  def test_4WTI():
39
39
  with open("tests/4WTI_1_T-P.cif") as f:
40
40
  structure3d = read_3d_structure(f, 1)
@@ -42,7 +42,7 @@ def test_4WTI():
42
42
  mapping = Mapping2D3D(
43
43
  structure3d, base_interactions.basePairs, base_interactions.stackings, True
44
44
  )
45
- assert mapping.dot_bracket == ">strand_T\nCGG\n.((\n>strand_P\nCC\n))"
45
+ assert mapping.dot_bracket == ">strand_T\nACGG\n..((\n>strand_P\nCC\n))"
46
46
 
47
47
 
48
48
  # in 1HMH the bases are oriented in 45 degrees and it caused the program to identify invalid base pair
@@ -64,3 +64,14 @@ def test_6INQ():
64
64
  assert structure3d.find_residue(None, ResidueAuth("T", 0, None, "DC")) is not None
65
65
  assert structure3d.find_residue(ResidueLabel("O", 126, "DG"), None) is not None
66
66
  assert structure3d.find_residue(None, ResidueAuth("N", 0, None, "DG")) is not None
67
+
68
+
69
+ # in 6g90 from rna3db, the sequence contains Ns which were ignored incorrectly
70
+ def test_6g90():
71
+ with open("tests/6g90_1.cif") as f:
72
+ structure3d = read_3d_structure(f, nucleic_acid_only=True)
73
+ sequence = "".join([residue.one_letter_name for residue in structure3d.residues])
74
+ assert (
75
+ sequence
76
+ == "AUACUUACCUUAAGAUAUCAGAGGAGAUCAAGAAGUCCUACUGAUCAAACAUGCGCUUCCAAGAAGGACGUUAAGCAUUUAUCAUUGAACGUUCAUUGAACAUUGAUGCAAACUCCUUGGUCACACACACGCGGAAGGCGUGUUUGCUGACGUCCCUUGUUUCAAUCAUUGGUUAACUGAUUUUUGGGGCCCUUUGUUCUUCUGAGAAGUGACACCAAUUGGUGUUAGGGGAGCUGGGGCCUUUCAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNUUUUGGAAGGUCUUGGUCGGGUGGAUCUUAUAAUUUUUGAUUUA"
77
+ )
@@ -2,7 +2,6 @@ from collections import Counter
2
2
 
3
3
  from hypothesis import given, settings
4
4
  from hypothesis import strategies as st
5
-
6
5
  from rnapolis.common import (
7
6
  BaseInteractions,
8
7
  BasePair,
@@ -93,7 +92,7 @@ def test_rnapdbee_adapters_api_compliance_structure2d(obj):
93
92
 
94
93
  def test_bpseq_from_dotbracket():
95
94
  expected = BpSeq.from_file("tests/1ET4-A.bpseq")
96
- actual = BpSeq.from_dotbracket(DotBracket.from_file(f"tests/1ET4-A.dbn"))
95
+ actual = BpSeq.from_dotbracket(DotBracket.from_file("tests/1ET4-A.dbn"))
97
96
  assert expected == actual
98
97
 
99
98
 
@@ -1,7 +1,6 @@
1
1
  import os
2
2
 
3
3
  import pytest
4
-
5
4
  from rnapolis.rfam_folder import generate_consensus_secondary_structure, parse_fasta
6
5
 
7
6
  IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
@@ -5,10 +5,10 @@ from rnapolis.tertiary import Atom, torsion_angle
5
5
 
6
6
 
7
7
  def test_torsion_angle():
8
- a1 = Atom(None, None, 1, "P", 50.63, 49.73, 50.57, None)
9
- a2 = Atom(None, None, 1, "O5'", 50.16, 49.14, 52.02, None)
10
- a3 = Atom(None, None, 1, "C5'", 50.22, 49.95, 53.21, None)
11
- a4 = Atom(None, None, 1, "C4'", 50.97, 49.23, 54.31, None)
8
+ a1 = Atom(None, None, None, 1, "P", 50.63, 49.73, 50.57, None)
9
+ a2 = Atom(None, None, None, 1, "O5'", 50.16, 49.14, 52.02, None)
10
+ a3 = Atom(None, None, None, 1, "C5'", 50.22, 49.95, 53.21, None)
11
+ a4 = Atom(None, None, None, 1, "C4'", 50.97, 49.23, 54.31, None)
12
12
  assert math.isclose(
13
13
  math.degrees(torsion_angle(a1, a2, a3, a4)), -127.83976634524326
14
14
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes