RNApolis 0.4.4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.4.4
3
+ Version: 0.4.7
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -1,17 +1,17 @@
1
1
  rnapolis/annotator.py,sha256=7U3f0gchKdIGc6FwJx0UAc_95HJI5SgECj-b7-1yBhc,22086
2
2
  rnapolis/clashfinder.py,sha256=i95kp0o6OWNqmJDBr-PbsZd7RY2iJtBDr7QqolJSuAQ,8513
3
- rnapolis/common.py,sha256=PUYF01P2vevhyImhZjGYE0jJlsxWHX6GQmsxI4W7S-E,30255
3
+ rnapolis/common.py,sha256=NWhlPwT521jCSWcDcm_TNoYENjoZWpllf9sS-WuTEmA,30361
4
4
  rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
5
5
  rnapolis/molecule_filter.py,sha256=hB6-nXgjmw7FAsQ3bj0cZ2FvuW2I1PXunEfcdwEUB1o,7389
6
6
  rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
7
- rnapolis/parser.py,sha256=wCA9rXqt51iLECgeBqOShFpuT8JwanNkHYD5uXYvLzU,13988
7
+ rnapolis/parser.py,sha256=2pQYy0sh8TCpeluMmmSJ7C5dudK_bsfstTWCdpwwpNU,15193
8
8
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
9
- rnapolis/tertiary.py,sha256=SQyiYWA0RJhAK70f88CKZvS4EzGKHQ2RoL1s4MueEDQ,21657
9
+ rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
10
10
  rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
11
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
12
- RNApolis-0.4.4.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
- RNApolis-0.4.4.dist-info/METADATA,sha256=irtWJbeg1LWun2r3WtnsnDDSHlLvru0hO9wz1e67cIE,54322
14
- RNApolis-0.4.4.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
15
- RNApolis-0.4.4.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
- RNApolis-0.4.4.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
- RNApolis-0.4.4.dist-info/RECORD,,
12
+ RNApolis-0.4.7.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.4.7.dist-info/METADATA,sha256=551L8oU_7CdBw7v0jezfHQX7YzF9Fo83E6NVbLVfA50,54322
14
+ RNApolis-0.4.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
15
+ RNApolis-0.4.7.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.4.7.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.4.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.2)
2
+ Generator: setuptools (75.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
rnapolis/common.py CHANGED
@@ -338,6 +338,9 @@ class Entry(Sequence):
338
338
  return self.pair
339
339
  raise IndexError()
340
340
 
341
def __lt__(self, other):
    """Return whether this entry's ``index_`` is smaller than ``other``'s.

    Defining ``<`` lets entries be ordered with ``sorted()`` / ``list.sort()``.

    Returns ``NotImplemented`` when ``other`` exposes no ``index_`` attribute,
    so Python can try the reflected comparison and otherwise raise a regular
    ``TypeError`` instead of leaking an ``AttributeError`` from inside here.
    """
    try:
        other_index = other.index_
    except AttributeError:
        return NotImplemented
    return self.index_ < other_index
343
+
341
344
  def __len__(self) -> int:
342
345
  return 3
343
346
 
@@ -838,7 +841,7 @@ class BpSeq:
838
841
 
839
842
  for i in range(1, len(regions)):
840
843
  k, l, _ = regions[i]
841
- available = [True for i in range(10)]
844
+ available = [True for _ in range(len("([{<" + string.ascii_uppercase))]
842
845
 
843
846
  for j in range(i):
844
847
  m, n, _ = regions[j]
rnapolis/parser.py CHANGED
@@ -1,7 +1,10 @@
1
1
  import logging
2
2
  from typing import IO, Dict, List, Optional, Tuple, Union
3
3
 
4
+ import numpy as np
4
5
  from mmcif.io.IoAdapterPy import IoAdapterPy
6
+ from scipy.spatial import KDTree
7
+
5
8
  from rnapolis.common import ResidueAuth, ResidueLabel
6
9
  from rnapolis.tertiary import BASE_ATOMS, Atom, Residue3D, Structure3D
7
10
 
@@ -53,10 +56,10 @@ def parse_cif(
53
56
 
54
57
  io_adapter = IoAdapterPy()
55
58
  data = io_adapter.readFile(cif.name)
56
- atoms: List[Atom] = []
59
+ atoms_to_process: List[Atom] = []
57
60
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
58
- sequence_by_entity = {}
59
- is_nucleic_acid_by_entity = {}
61
+ sequence_by_entity: Dict[str, str] = {}
62
+ is_nucleic_acid_by_entity: Dict[str, bool] = {}
60
63
 
61
64
  if data:
62
65
  atom_site = data[0].getObj("atom_site")
@@ -136,7 +139,7 @@ def parse_cif(
136
139
  else None
137
140
  )
138
141
 
139
- atoms.append(
142
+ atoms_to_process.append(
140
143
  Atom(
141
144
  label_entity_id,
142
145
  label,
@@ -216,6 +219,7 @@ def parse_cif(
216
219
  if entity_id and pdbx_seq_one_letter_code_can:
217
220
  sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
218
221
 
222
+ atoms = filter_clashing_atoms(atoms_to_process)
219
223
  return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
220
224
 
221
225
 
@@ -228,7 +232,7 @@ def parse_pdb(
228
232
  Dict[str, bool],
229
233
  ]:
230
234
  pdb.seek(0)
231
- atoms: List[Atom] = []
235
+ atoms_to_process: List[Atom] = []
232
236
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
233
237
  model = 1
234
238
 
@@ -236,9 +240,6 @@ def parse_pdb(
236
240
  if line.startswith("MODEL"):
237
241
  model = int(line[10:14].strip())
238
242
  elif line.startswith("ATOM") or line.startswith("HETATM"):
239
- alternate_location = line[16]
240
- if alternate_location != " ":
241
- continue
242
243
  atom_name = line[12:16].strip()
243
244
  residue_name = line[17:20].strip()
244
245
  chain_identifier = line[21]
@@ -251,7 +252,10 @@ def parse_pdb(
251
252
  auth = ResidueAuth(
252
253
  chain_identifier, residue_number, insertion_code, residue_name
253
254
  )
254
- atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
255
+
256
+ atoms_to_process.append(
257
+ Atom(None, None, auth, model, atom_name, x, y, z, occupancy)
258
+ )
255
259
  elif line.startswith("MODRES"):
256
260
  original_name = line[12:15]
257
261
  chain_identifier = line[16]
@@ -263,6 +267,7 @@ def parse_pdb(
263
267
  )
264
268
  modified[auth] = standard_residue_name
265
269
 
270
+ atoms = filter_clashing_atoms(atoms_to_process)
266
271
  return atoms, modified, {}, {}
267
272
 
268
273
 
@@ -392,3 +397,36 @@ def try_parse_int(s: str) -> Optional[int]:
392
397
  return int(s)
393
398
  except ValueError:
394
399
  return None
400
+
401
+
402
def filter_clashing_atoms(atoms: List[Atom], clash_distance: float = 0.5) -> List[Atom]:
    """Drop duplicated atoms and atoms that clash with a better-occupied one.

    The filtering runs in two passes:

    1. Atoms sharing the same (model, label, auth, name) are treated as
       duplicates; the one with the highest known occupancy is kept, and the
       first one seen wins when occupancies are equal or unknown.
    2. Within each model, every pair of remaining atoms closer than
       ``clash_distance`` is a clash; the atom with the lower occupancy is
       dropped (on a tie, the one that comes first). Pairs where either
       occupancy is ``None`` are left untouched.

    Args:
        atoms: Atoms to filter. Each must expose ``label``, ``auth``, ``name``,
            ``x``, ``y``, ``z`` and ``occupancy``.
        clash_distance: Radius passed to ``KDTree.query_pairs``; atoms closer
            than this are considered clashing.

    Returns:
        The surviving atoms, in the order of their first appearance in
        ``atoms``.
    """
    # Pass 1: collapse duplicates. The model is part of the key so that the
    # same atom repeated in several models is not mistaken for a duplicate.
    # NOTE(review): Atom is built with a model argument by both parsers; the
    # attribute is assumed to be called ``model`` -- getattr keeps this safe.
    best_by_key = {}
    for atom in atoms:
        key = (getattr(atom, "model", None), atom.label, atom.auth, atom.name)
        if key not in best_by_key or _occupancy_exceeds(atom, best_by_key[key]):
            best_by_key[key] = atom

    candidates = list(best_by_key.values())
    if len(candidates) < 2:
        # Nothing can clash, and KDTree rejects an empty coordinate array.
        return candidates

    # Pass 2: look for clashes separately in every model; atoms of different
    # models legitimately occupy (almost) the same positions.
    indices_by_model = {}
    for index, atom in enumerate(candidates):
        indices_by_model.setdefault(getattr(atom, "model", None), []).append(index)

    dropped = set()
    for indices in indices_by_model.values():
        if len(indices) < 2:
            continue

        coords = np.array(
            [(candidates[i].x, candidates[i].y, candidates[i].z) for i in indices],
            dtype=float,
        )

        # query_pairs yields (a, b) with a < b, as positions within ``indices``.
        for a, b in KDTree(coords).query_pairs(r=clash_distance):
            first, second = indices[a], indices[b]
            first_occupancy = candidates[first].occupancy
            second_occupancy = candidates[second].occupancy

            if first_occupancy is None or second_occupancy is None:
                continue

            dropped.add(second if first_occupancy > second_occupancy else first)

    # Filter by index instead of iterating a set, so the order is guaranteed.
    return [atom for index, atom in enumerate(candidates) if index not in dropped]


def _occupancy_exceeds(candidate, incumbent) -> bool:
    """Tell whether ``candidate`` has a known occupancy above ``incumbent``'s."""
    if candidate.occupancy is None or incumbent.occupancy is None:
        return False
    return candidate.occupancy > incumbent.occupancy
rnapolis/tertiary.py CHANGED
@@ -124,36 +124,17 @@ class Residue3D(Residue):
124
124
  outermost_atoms = {"A": "N9", "G": "N9", "C": "N1", "U": "N1", "T": "N1"}
125
125
  # Dist representing expected name of atom closest to the tetrad center
126
126
  innermost_atoms = {"A": "N6", "G": "O6", "C": "N4", "U": "O4", "T": "O4"}
127
+ # Heavy atoms in phosphate and ribose
128
+ phosphate_atoms = {"P", "OP1", "OP2", "O3'", "O5'"}
129
+ sugar_atoms = {"C1'", "C2'", "C3'", "C4'", "C5'", "O4'"}
127
130
  # Heavy atoms for each main nucleobase
128
131
  nucleobase_heavy_atoms = {
129
132
  "A": set(["N1", "C2", "N3", "C4", "C5", "C6", "N6", "N7", "C8", "N9"]),
130
133
  "G": set(["N1", "C2", "N2", "N3", "C4", "C5", "C6", "O6", "N7", "C8", "N9"]),
131
134
  "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
132
135
  "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
136
+ "T": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C5M", "C6"]),
133
137
  }
134
- # Heavy atoms in nucleotide
135
- nucleotide_heavy_atoms = (
136
- set(
137
- [
138
- "P",
139
- "OP1",
140
- "OP2",
141
- "O5'",
142
- "C5'",
143
- "C4'",
144
- "O4'",
145
- "C3'",
146
- "O3'",
147
- "C2'",
148
- "O2'",
149
- "C1'",
150
- ]
151
- )
152
- .union(nucleobase_heavy_atoms["A"])
153
- .union(nucleobase_heavy_atoms["G"])
154
- .union(nucleobase_heavy_atoms["C"])
155
- .union(nucleobase_heavy_atoms["U"])
156
- )
157
138
 
158
139
  def __lt__(self, other):
159
140
  return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -202,9 +183,59 @@ class Residue3D(Residue):
202
183
 
203
184
  @cached_property
204
185
  def is_nucleotide(self) -> bool:
205
- return self.nucleotide_heavy_atoms.intersection(
206
- set([atom.name for atom in self.atoms])
186
+ scores = {"phosphate": 0.0, "sugar": 0.0, "base": 0.0, "connections": 0.0}
187
+ weights = {"phosphate": 0.25, "sugar": 0.25, "base": 0.25, "connections": 0.25}
188
+
189
+ residue_atoms = {atom.name for atom in self.atoms}
190
+
191
+ phosphate_match = len(residue_atoms.intersection(self.phosphate_atoms))
192
+ scores["phosphate"] = phosphate_match / len(self.phosphate_atoms)
193
+
194
+ sugar_match = len(residue_atoms.intersection(self.sugar_atoms))
195
+ scores["sugar"] = sugar_match / len(self.sugar_atoms)
196
+
197
+ nucleobase_atoms = {
198
+ key: self.nucleobase_heavy_atoms[key] for key in self.nucleobase_heavy_atoms
199
+ }
200
+ matches = {
201
+ key: len(residue_atoms.intersection(nucleobase_atoms[key]))
202
+ / len(nucleobase_atoms[key])
203
+ for key in nucleobase_atoms
204
+ }
205
+ best_match = max(matches.items(), key=lambda x: x[1])
206
+ scores["base"] = best_match[1]
207
+
208
+ connection_score = 0.0
209
+ distance_threshold = 2.0
210
+
211
+ if "P" in residue_atoms and "O5'" in residue_atoms:
212
+ p_atom = next(atom for atom in self.atoms if atom.name == "P")
213
+ o5_atom = next(atom for atom in self.atoms if atom.name == "O5'")
214
+ if (
215
+ numpy.linalg.norm(p_atom.coordinates - o5_atom.coordinates)
216
+ <= distance_threshold
217
+ ):
218
+ connection_score += 0.5
219
+ if "C1'" in residue_atoms:
220
+ c1_atom = next(atom for atom in self.atoms if atom.name == "C1'")
221
+ for base_connection in ["N9", "N1"]:
222
+ if base_connection in residue_atoms:
223
+ base_atom = next(
224
+ atom for atom in self.atoms if atom.name == base_connection
225
+ )
226
+ if (
227
+ numpy.linalg.norm(c1_atom.coordinates - base_atom.coordinates)
228
+ <= distance_threshold
229
+ ):
230
+ connection_score += 0.5
231
+ break
232
+
233
+ scores["connections"] = connection_score
234
+
235
+ probability = sum(
236
+ scores[component] * weights[component] for component in scores.keys()
207
237
  )
238
+ return probability > 0.5
208
239
 
209
240
  @cached_property
210
241
  def base_normal_vector(self) -> Optional[numpy.typing.NDArray[numpy.floating]]:
@@ -566,15 +597,14 @@ class Mapping2D3D:
566
597
  return self.__generate_bpseq(canonical)
567
598
 
568
599
  def __generate_bpseq(self, base_pairs):
600
+ nucleotides = list(filter(lambda r: r.is_nucleotide, self.structure3d.residues))
569
601
  result: Dict[int, List] = {}
570
602
  residue_map: Dict[Residue3D, int] = {}
571
603
  i = 1
572
604
 
573
- for j, residue in enumerate(
574
- filter(lambda r: r.is_nucleotide, self.structure3d.residues)
575
- ):
605
+ for j, residue in enumerate(nucleotides):
576
606
  if self.find_gaps and j > 0:
577
- previous = self.structure3d.residues[j - 1]
607
+ previous = nucleotides[j - 1]
578
608
 
579
609
  if (
580
610
  not previous.is_connected(residue)