RNApolis 0.4.4__py3-none-any.whl → 0.4.7__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.4.4
3
+ Version: 0.4.7
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -1,17 +1,17 @@
1
1
  rnapolis/annotator.py,sha256=7U3f0gchKdIGc6FwJx0UAc_95HJI5SgECj-b7-1yBhc,22086
2
2
  rnapolis/clashfinder.py,sha256=i95kp0o6OWNqmJDBr-PbsZd7RY2iJtBDr7QqolJSuAQ,8513
3
- rnapolis/common.py,sha256=PUYF01P2vevhyImhZjGYE0jJlsxWHX6GQmsxI4W7S-E,30255
3
+ rnapolis/common.py,sha256=NWhlPwT521jCSWcDcm_TNoYENjoZWpllf9sS-WuTEmA,30361
4
4
  rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
5
5
  rnapolis/molecule_filter.py,sha256=hB6-nXgjmw7FAsQ3bj0cZ2FvuW2I1PXunEfcdwEUB1o,7389
6
6
  rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
7
- rnapolis/parser.py,sha256=wCA9rXqt51iLECgeBqOShFpuT8JwanNkHYD5uXYvLzU,13988
7
+ rnapolis/parser.py,sha256=2pQYy0sh8TCpeluMmmSJ7C5dudK_bsfstTWCdpwwpNU,15193
8
8
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
9
- rnapolis/tertiary.py,sha256=SQyiYWA0RJhAK70f88CKZvS4EzGKHQ2RoL1s4MueEDQ,21657
9
+ rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
10
10
  rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
11
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
12
- RNApolis-0.4.4.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
- RNApolis-0.4.4.dist-info/METADATA,sha256=irtWJbeg1LWun2r3WtnsnDDSHlLvru0hO9wz1e67cIE,54322
14
- RNApolis-0.4.4.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
15
- RNApolis-0.4.4.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
- RNApolis-0.4.4.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
- RNApolis-0.4.4.dist-info/RECORD,,
12
+ RNApolis-0.4.7.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.4.7.dist-info/METADATA,sha256=551L8oU_7CdBw7v0jezfHQX7YzF9Fo83E6NVbLVfA50,54322
14
+ RNApolis-0.4.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
15
+ RNApolis-0.4.7.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.4.7.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.4.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.2)
2
+ Generator: setuptools (75.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
rnapolis/common.py CHANGED
@@ -338,6 +338,9 @@ class Entry(Sequence):
338
338
  return self.pair
339
339
  raise IndexError()
340
340
 
341
def __lt__(self, other):
    """Order entries by ascending ``index_``.

    Returns ``NotImplemented`` when ``other`` has no ``index_`` attribute, so
    that Python can try the reflected comparison and otherwise raise its
    usual ``TypeError`` instead of leaking an ``AttributeError``.
    """
    try:
        other_index = other.index_
    except AttributeError:
        return NotImplemented
    return self.index_ < other_index
343
+
341
344
  def __len__(self) -> int:
342
345
  return 3
343
346
 
@@ -838,7 +841,7 @@ class BpSeq:
838
841
 
839
842
  for i in range(1, len(regions)):
840
843
  k, l, _ = regions[i]
841
- available = [True for i in range(10)]
844
+ available = [True for _ in range(len("([{<" + string.ascii_uppercase))]
842
845
 
843
846
  for j in range(i):
844
847
  m, n, _ = regions[j]
rnapolis/parser.py CHANGED
@@ -1,7 +1,10 @@
1
1
  import logging
2
2
  from typing import IO, Dict, List, Optional, Tuple, Union
3
3
 
4
+ import numpy as np
4
5
  from mmcif.io.IoAdapterPy import IoAdapterPy
6
+ from scipy.spatial import KDTree
7
+
5
8
  from rnapolis.common import ResidueAuth, ResidueLabel
6
9
  from rnapolis.tertiary import BASE_ATOMS, Atom, Residue3D, Structure3D
7
10
 
@@ -53,10 +56,10 @@ def parse_cif(
53
56
 
54
57
  io_adapter = IoAdapterPy()
55
58
  data = io_adapter.readFile(cif.name)
56
- atoms: List[Atom] = []
59
+ atoms_to_process: List[Atom] = []
57
60
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
58
- sequence_by_entity = {}
59
- is_nucleic_acid_by_entity = {}
61
+ sequence_by_entity: Dict[str, str] = {}
62
+ is_nucleic_acid_by_entity: Dict[str, bool] = {}
60
63
 
61
64
  if data:
62
65
  atom_site = data[0].getObj("atom_site")
@@ -136,7 +139,7 @@ def parse_cif(
136
139
  else None
137
140
  )
138
141
 
139
- atoms.append(
142
+ atoms_to_process.append(
140
143
  Atom(
141
144
  label_entity_id,
142
145
  label,
@@ -216,6 +219,7 @@ def parse_cif(
216
219
  if entity_id and pdbx_seq_one_letter_code_can:
217
220
  sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
218
221
 
222
+ atoms = filter_clashing_atoms(atoms_to_process)
219
223
  return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
220
224
 
221
225
 
@@ -228,7 +232,7 @@ def parse_pdb(
228
232
  Dict[str, bool],
229
233
  ]:
230
234
  pdb.seek(0)
231
- atoms: List[Atom] = []
235
+ atoms_to_process: List[Atom] = []
232
236
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
233
237
  model = 1
234
238
 
@@ -236,9 +240,6 @@ def parse_pdb(
236
240
  if line.startswith("MODEL"):
237
241
  model = int(line[10:14].strip())
238
242
  elif line.startswith("ATOM") or line.startswith("HETATM"):
239
- alternate_location = line[16]
240
- if alternate_location != " ":
241
- continue
242
243
  atom_name = line[12:16].strip()
243
244
  residue_name = line[17:20].strip()
244
245
  chain_identifier = line[21]
@@ -251,7 +252,10 @@ def parse_pdb(
251
252
  auth = ResidueAuth(
252
253
  chain_identifier, residue_number, insertion_code, residue_name
253
254
  )
254
- atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
255
+
256
+ atoms_to_process.append(
257
+ Atom(None, None, auth, model, atom_name, x, y, z, occupancy)
258
+ )
255
259
  elif line.startswith("MODRES"):
256
260
  original_name = line[12:15]
257
261
  chain_identifier = line[16]
@@ -263,6 +267,7 @@ def parse_pdb(
263
267
  )
264
268
  modified[auth] = standard_residue_name
265
269
 
270
+ atoms = filter_clashing_atoms(atoms_to_process)
266
271
  return atoms, modified, {}, {}
267
272
 
268
273
 
@@ -392,3 +397,36 @@ def try_parse_int(s: str) -> Optional[int]:
392
397
  return int(s)
393
398
  except ValueError:
394
399
  return None
400
+
401
+
402
def filter_clashing_atoms(
    atoms: List["Atom"], clash_distance: float = 0.5
) -> List["Atom"]:
    """Drop duplicated and spatially clashing atoms, preferring higher occupancy.

    The filtering runs in two passes:

    1. Atoms sharing the same ``(label, auth, name)`` key are collapsed to a
       single atom. The one with the strictly highest occupancy wins and the
       first one seen wins ties. An atom whose occupancy is ``None`` never
       replaces an earlier atom and is replaced by any later atom with a
       known occupancy.
    2. For every pair of remaining atoms closer than ``clash_distance``, the
       atom with the lower occupancy is dropped; on a tie the atom that comes
       first is dropped. Pairs in which either occupancy is ``None`` are left
       untouched.

    Args:
        atoms: Atoms to filter; each must expose ``label``, ``auth``, ``name``,
            ``x``, ``y``, ``z`` and ``occupancy``.
        clash_distance: Distance below which two atoms are treated as
            clashing, in the same units as the atom coordinates.

    Returns:
        The surviving atoms, in the order they first appeared in ``atoms``.
    """
    if not atoms:
        # KDTree cannot be built from an empty coordinate array.
        return []

    def occupancy_rank(atom):
        # An unknown occupancy ranks below every known value.
        return float("-inf") if atom.occupancy is None else atom.occupancy

    # NOTE(review): the key does not include the model number, so same-named
    # atoms of the same residue in different models are treated as duplicates
    # -- confirm this is intended for multi-model input.
    unique_atoms = {}
    for atom in atoms:
        key = (atom.label, atom.auth, atom.name)
        current = unique_atoms.get(key)
        if current is None or occupancy_rank(atom) > occupancy_rank(current):
            unique_atoms[key] = atom

    candidates = list(unique_atoms.values())

    coords = np.array([(atom.x, atom.y, atom.z) for atom in candidates])
    tree = KDTree(coords)

    discarded = set()
    # query_pairs yields index pairs (i, j) with i < j.
    for i, j in tree.query_pairs(r=clash_distance):
        first, second = candidates[i].occupancy, candidates[j].occupancy
        if first is None or second is None:
            continue
        discarded.add(j if first > second else i)

    # Walk the list rather than a set so the output order is deterministic.
    return [atom for index, atom in enumerate(candidates) if index not in discarded]
rnapolis/tertiary.py CHANGED
@@ -124,36 +124,17 @@ class Residue3D(Residue):
124
124
  outermost_atoms = {"A": "N9", "G": "N9", "C": "N1", "U": "N1", "T": "N1"}
125
125
  # Dict representing expected name of atom closest to the tetrad center
126
126
  innermost_atoms = {"A": "N6", "G": "O6", "C": "N4", "U": "O4", "T": "O4"}
127
+ # Heavy atoms in phosphate and ribose
128
+ phosphate_atoms = {"P", "OP1", "OP2", "O3'", "O5'"}
129
+ sugar_atoms = {"C1'", "C2'", "C3'", "C4'", "C5'", "O4'"}
127
130
  # Heavy atoms for each main nucleobase
128
131
  nucleobase_heavy_atoms = {
129
132
  "A": set(["N1", "C2", "N3", "C4", "C5", "C6", "N6", "N7", "C8", "N9"]),
130
133
  "G": set(["N1", "C2", "N2", "N3", "C4", "C5", "C6", "O6", "N7", "C8", "N9"]),
131
134
  "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
132
135
  "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
136
+ "T": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C5M", "C6"]),
133
137
  }
134
- # Heavy atoms in nucleotide
135
- nucleotide_heavy_atoms = (
136
- set(
137
- [
138
- "P",
139
- "OP1",
140
- "OP2",
141
- "O5'",
142
- "C5'",
143
- "C4'",
144
- "O4'",
145
- "C3'",
146
- "O3'",
147
- "C2'",
148
- "O2'",
149
- "C1'",
150
- ]
151
- )
152
- .union(nucleobase_heavy_atoms["A"])
153
- .union(nucleobase_heavy_atoms["G"])
154
- .union(nucleobase_heavy_atoms["C"])
155
- .union(nucleobase_heavy_atoms["U"])
156
- )
157
138
 
158
139
  def __lt__(self, other):
159
140
  return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -202,9 +183,59 @@ class Residue3D(Residue):
202
183
 
203
184
  @cached_property
204
185
  def is_nucleotide(self) -> bool:
205
- return self.nucleotide_heavy_atoms.intersection(
206
- set([atom.name for atom in self.atoms])
186
+ scores = {"phosphate": 0.0, "sugar": 0.0, "base": 0.0, "connections": 0.0}
187
+ weights = {"phosphate": 0.25, "sugar": 0.25, "base": 0.25, "connections": 0.25}
188
+
189
+ residue_atoms = {atom.name for atom in self.atoms}
190
+
191
+ phosphate_match = len(residue_atoms.intersection(self.phosphate_atoms))
192
+ scores["phosphate"] = phosphate_match / len(self.phosphate_atoms)
193
+
194
+ sugar_match = len(residue_atoms.intersection(self.sugar_atoms))
195
+ scores["sugar"] = sugar_match / len(self.sugar_atoms)
196
+
197
+ nucleobase_atoms = {
198
+ key: self.nucleobase_heavy_atoms[key] for key in self.nucleobase_heavy_atoms
199
+ }
200
+ matches = {
201
+ key: len(residue_atoms.intersection(nucleobase_atoms[key]))
202
+ / len(nucleobase_atoms[key])
203
+ for key in nucleobase_atoms
204
+ }
205
+ best_match = max(matches.items(), key=lambda x: x[1])
206
+ scores["base"] = best_match[1]
207
+
208
+ connection_score = 0.0
209
+ distance_threshold = 2.0
210
+
211
+ if "P" in residue_atoms and "O5'" in residue_atoms:
212
+ p_atom = next(atom for atom in self.atoms if atom.name == "P")
213
+ o5_atom = next(atom for atom in self.atoms if atom.name == "O5'")
214
+ if (
215
+ numpy.linalg.norm(p_atom.coordinates - o5_atom.coordinates)
216
+ <= distance_threshold
217
+ ):
218
+ connection_score += 0.5
219
+ if "C1'" in residue_atoms:
220
+ c1_atom = next(atom for atom in self.atoms if atom.name == "C1'")
221
+ for base_connection in ["N9", "N1"]:
222
+ if base_connection in residue_atoms:
223
+ base_atom = next(
224
+ atom for atom in self.atoms if atom.name == base_connection
225
+ )
226
+ if (
227
+ numpy.linalg.norm(c1_atom.coordinates - base_atom.coordinates)
228
+ <= distance_threshold
229
+ ):
230
+ connection_score += 0.5
231
+ break
232
+
233
+ scores["connections"] = connection_score
234
+
235
+ probability = sum(
236
+ scores[component] * weights[component] for component in scores.keys()
207
237
  )
238
+ return probability > 0.5
208
239
 
209
240
  @cached_property
210
241
  def base_normal_vector(self) -> Optional[numpy.typing.NDArray[numpy.floating]]:
@@ -566,15 +597,14 @@ class Mapping2D3D:
566
597
  return self.__generate_bpseq(canonical)
567
598
 
568
599
  def __generate_bpseq(self, base_pairs):
600
+ nucleotides = list(filter(lambda r: r.is_nucleotide, self.structure3d.residues))
569
601
  result: Dict[int, List] = {}
570
602
  residue_map: Dict[Residue3D, int] = {}
571
603
  i = 1
572
604
 
573
- for j, residue in enumerate(
574
- filter(lambda r: r.is_nucleotide, self.structure3d.residues)
575
- ):
605
+ for j, residue in enumerate(nucleotides):
576
606
  if self.find_gaps and j > 0:
577
- previous = self.structure3d.residues[j - 1]
607
+ previous = nucleotides[j - 1]
578
608
 
579
609
  if (
580
610
  not previous.is_connected(residue)