RNApolis 0.3.18__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.3.18
3
+ Version: 0.4.1
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -4,14 +4,14 @@ rnapolis/common.py,sha256=PUYF01P2vevhyImhZjGYE0jJlsxWHX6GQmsxI4W7S-E,30255
4
4
  rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
5
5
  rnapolis/molecule_filter.py,sha256=hB6-nXgjmw7FAsQ3bj0cZ2FvuW2I1PXunEfcdwEUB1o,7389
6
6
  rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
7
- rnapolis/parser.py,sha256=rQuzaRqsNTdHCS8_dKW5uT_nSi0xLnixF5xfy8puo_s,12665
7
+ rnapolis/parser.py,sha256=wCA9rXqt51iLECgeBqOShFpuT8JwanNkHYD5uXYvLzU,13988
8
8
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
9
- rnapolis/tertiary.py,sha256=wy8n7gfLHj_lIAydGgIwn_RMvBYiJzkSxXU_eff3ym0,19657
9
+ rnapolis/tertiary.py,sha256=qk1te8GPDuvQsnm4rTiw96VDYyNoO5x4IPf98zDzxPw,20824
10
10
  rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
11
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
12
- RNApolis-0.3.18.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
- RNApolis-0.3.18.dist-info/METADATA,sha256=OkVdmxEbvCPPkUD1R49oZv5j89ng8-tds1yQeZYzORk,54323
14
- RNApolis-0.3.18.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
15
- RNApolis-0.3.18.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
- RNApolis-0.3.18.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
- RNApolis-0.3.18.dist-info/RECORD,,
12
+ RNApolis-0.4.1.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.4.1.dist-info/METADATA,sha256=EayMQbE4Y5raff-7pFMmeh4EB81JaLEEMrbvEA4mbAk,54322
14
+ RNApolis-0.4.1.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
15
+ RNApolis-0.4.1.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.4.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.4.1.dist-info/RECORD,,
rnapolis/parser.py CHANGED
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
12
12
  def read_3d_structure(
13
13
  cif_or_pdb: IO[str], model: Optional[int] = None, nucleic_acid_only: bool = False
14
14
  ) -> Structure3D:
15
- atoms, modified, sequence = (
15
+ atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity = (
16
16
  parse_cif(cif_or_pdb) if is_cif(cif_or_pdb) else parse_pdb(cif_or_pdb)
17
17
  )
18
18
  available_models = {atom.model: None for atom in atoms}
@@ -24,7 +24,13 @@ def read_3d_structure(
24
24
  atoms = atoms_by_model[model]
25
25
  else:
26
26
  atoms = atoms_by_model[list(available_models.keys())[0]]
27
- return group_atoms(atoms, modified, sequence, nucleic_acid_only)
27
+ return group_atoms(
28
+ atoms,
29
+ modified,
30
+ sequence_by_entity,
31
+ is_nucleic_acid_by_entity,
32
+ nucleic_acid_only,
33
+ )
28
34
 
29
35
 
30
36
  def is_cif(cif_or_pdb: IO[str]) -> bool:
@@ -40,7 +46,8 @@ def parse_cif(
40
46
  ) -> Tuple[
41
47
  List[Atom],
42
48
  Dict[Union[ResidueLabel, ResidueAuth], str],
43
- Dict[Tuple[str, int], str],
49
+ Dict[str, str],
50
+ Dict[str, bool],
44
51
  ]:
45
52
  cif.seek(0)
46
53
 
@@ -48,7 +55,8 @@ def parse_cif(
48
55
  data = io_adapter.readFile(cif.name)
49
56
  atoms: List[Atom] = []
50
57
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
51
- sequence = {}
58
+ sequence_by_entity = {}
59
+ is_nucleic_acid_by_entity = {}
52
60
 
53
61
  if data:
54
62
  atom_site = data[0].getObj("atom_site")
@@ -59,6 +67,7 @@ def parse_cif(
59
67
  for row in atom_site.getRowList():
60
68
  row_dict = dict(zip(atom_site.getAttributeList(), row))
61
69
 
70
+ label_entity_id = row_dict.get("label_entity_id", None)
62
71
  label_chain_name = row_dict.get("label_asym_id", None)
63
72
  label_residue_number = try_parse_int(row_dict.get("label_seq_id", None))
64
73
  label_residue_name = row_dict.get("label_comp_id", None)
@@ -127,7 +136,19 @@ def parse_cif(
127
136
  else None
128
137
  )
129
138
 
130
- atoms.append(Atom(label, auth, model, atom_name, x, y, z, occupancy))
139
+ atoms.append(
140
+ Atom(
141
+ label_entity_id,
142
+ label,
143
+ auth,
144
+ model,
145
+ atom_name,
146
+ x,
147
+ y,
148
+ z,
149
+ occupancy,
150
+ )
151
+ )
131
152
 
132
153
  if mod_residue:
133
154
  for row in mod_residue.getRowList():
@@ -178,17 +199,24 @@ def parse_cif(
178
199
  for row in entity_poly.getRowList():
179
200
  row_dict = dict(zip(entity_poly.getAttributeList(), row))
180
201
 
181
- pdbx_strand_id = row_dict.get("pdbx_strand_id", None)
202
+ entity_id = row_dict.get("entity_id", None)
203
+ type_ = row_dict.get("type", None)
182
204
  pdbx_seq_one_letter_code_can = row_dict.get(
183
205
  "pdbx_seq_one_letter_code_can", None
184
206
  )
185
207
 
186
- if pdbx_strand_id and pdbx_seq_one_letter_code_can:
187
- for strand in pdbx_strand_id.split(","):
188
- for i, letter in enumerate(pdbx_seq_one_letter_code_can):
189
- sequence[(strand, i + 1)] = letter
208
+ if entity_id and type_:
209
+ is_nucleic_acid_by_entity[entity_id] = type_ in (
210
+ "peptide nucleic acid",
211
+ "polydeoxyribonucleotide",
212
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
213
+ "polyribonucleotide",
214
+ )
215
+
216
+ if entity_id and pdbx_seq_one_letter_code_can:
217
+ sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
190
218
 
191
- return atoms, modified, sequence
219
+ return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
192
220
 
193
221
 
194
222
  def parse_pdb(
@@ -196,7 +224,8 @@ def parse_pdb(
196
224
  ) -> Tuple[
197
225
  List[Atom],
198
226
  Dict[Union[ResidueLabel, ResidueAuth], str],
199
- Dict[Tuple[str, int], str],
227
+ Dict[str, str],
228
+ Dict[str, bool],
200
229
  ]:
201
230
  pdb.seek(0)
202
231
  atoms: List[Atom] = []
@@ -222,7 +251,7 @@ def parse_pdb(
222
251
  auth = ResidueAuth(
223
252
  chain_identifier, residue_number, insertion_code, residue_name
224
253
  )
225
- atoms.append(Atom(None, auth, model, atom_name, x, y, z, occupancy))
254
+ atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
226
255
  elif line.startswith("MODRES"):
227
256
  original_name = line[12:15]
228
257
  chain_identifier = line[16]
@@ -234,13 +263,14 @@ def parse_pdb(
234
263
  )
235
264
  modified[auth] = standard_residue_name
236
265
 
237
- return atoms, modified, {}
266
+ return atoms, modified, {}, {}
238
267
 
239
268
 
240
269
  def group_atoms(
241
270
  atoms: List[Atom],
242
271
  modified: Dict[Union[ResidueLabel, ResidueAuth], str],
243
- sequence: Dict[Tuple[str, int], str],
272
+ sequence_by_entity: Dict[str, str],
273
+ is_nucleic_acid_by_entity: Dict[str, bool],
244
274
  nucleic_acid_only: bool,
245
275
  ) -> Structure3D:
246
276
  if not atoms:
@@ -258,28 +288,45 @@ def group_atoms(
258
288
  label = key_previous[0]
259
289
  auth = key_previous[1]
260
290
  model = key_previous[2]
291
+ entity_id = residue_atoms[-1].entity_id
261
292
  name = get_residue_name(auth, label, modified)
262
- one_letter_name = get_one_letter_name(label, sequence, name)
263
- if one_letter_name not in "ACGUT":
293
+ one_letter_name = get_one_letter_name(
294
+ entity_id, label, sequence_by_entity, name
295
+ )
296
+
297
+ if one_letter_name not in "ACGUTN":
264
298
  one_letter_name = detect_one_letter_name(residue_atoms)
265
- residue = Residue3D(
266
- label, auth, model, one_letter_name, tuple(residue_atoms)
299
+
300
+ residues.append(
301
+ Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
267
302
  )
268
- if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
269
- residues.append(residue)
303
+
270
304
  key_previous = key
271
305
  residue_atoms = [atom]
272
306
 
273
307
  label = key_previous[0]
274
308
  auth = key_previous[1]
275
309
  model = key_previous[2]
310
+ entity_id = residue_atoms[-1].entity_id
276
311
  name = get_residue_name(auth, label, modified)
277
- one_letter_name = get_one_letter_name(label, sequence, name)
278
- if one_letter_name not in "ACGUT":
312
+ one_letter_name = get_one_letter_name(entity_id, label, sequence_by_entity, name)
313
+
314
+ if one_letter_name not in "ACGUTN":
279
315
  one_letter_name = detect_one_letter_name(residue_atoms)
280
- residue = Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
281
- if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
282
- residues.append(residue)
316
+
317
+ residues.append(
318
+ Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
319
+ )
320
+
321
+ if nucleic_acid_only:
322
+ if is_nucleic_acid_by_entity:
323
+ residues = [
324
+ residue
325
+ for residue in residues
326
+ if is_nucleic_acid_by_entity[residue.atoms[0].entity_id]
327
+ ]
328
+ else:
329
+ residues = [residue for residue in residues if residue.is_nucleotide]
283
330
 
284
331
  return Structure3D(residues)
285
332
 
@@ -304,13 +351,14 @@ def get_residue_name(
304
351
 
305
352
 
306
353
  def get_one_letter_name(
307
- label: Optional[ResidueLabel], sequence: Dict[Tuple[str, int], str], name: str
354
+ entity_id: Optional[str],
355
+ label: Optional[ResidueLabel],
356
+ sequence_by_entity: Dict[str, str],
357
+ name: str,
308
358
  ) -> str:
309
359
  # try getting the value from _entity_poly first
310
- if label is not None:
311
- key = (label.chain, label.number)
312
- if key in sequence:
313
- return sequence[key]
360
+ if entity_id is not None and label is not None and entity_id in sequence_by_entity:
361
+ return sequence_by_entity[entity_id][label.number - 1]
314
362
  # RNA
315
363
  if len(name) == 1:
316
364
  return name
@@ -334,11 +382,13 @@ def detect_one_letter_name(atoms: List[Atom]) -> str:
334
382
  ) / len(atom_names_expected)
335
383
  score[candidate] = count
336
384
  items = sorted(score.items(), key=lambda kv: kv[1], reverse=True)
385
+ if items[0][1] == 0:
386
+ return "?"
337
387
  return items[0][0]
338
388
 
339
389
 
340
390
  def try_parse_int(s: str) -> Optional[int]:
341
391
  try:
342
392
  return int(s)
343
- except:
393
+ except ValueError:
344
394
  return None
rnapolis/tertiary.py CHANGED
@@ -96,6 +96,7 @@ AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
96
96
 
97
97
  @dataclass(frozen=True, order=True)
98
98
  class Atom:
99
+ entity_id: Optional[str]
99
100
  label: Optional[ResidueLabel]
100
101
  auth: Optional[ResidueAuth]
101
102
  model: int
@@ -128,6 +129,29 @@ class Residue3D(Residue):
128
129
  "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
129
130
  "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
130
131
  }
132
+ # Heavy atoms in nucleotide
133
+ nucleotide_heavy_atoms = (
134
+ set(
135
+ [
136
+ "P",
137
+ "OP1",
138
+ "OP2",
139
+ "O5'",
140
+ "C5'",
141
+ "C4'",
142
+ "O4'",
143
+ "C3'",
144
+ "O3'",
145
+ "C2'",
146
+ "O2'",
147
+ "C1'",
148
+ ]
149
+ )
150
+ .union(nucleobase_heavy_atoms["A"])
151
+ .union(nucleobase_heavy_atoms["G"])
152
+ .union(nucleobase_heavy_atoms["C"])
153
+ .union(nucleobase_heavy_atoms["U"])
154
+ )
131
155
 
132
156
  def __lt__(self, other):
133
157
  return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -176,8 +200,8 @@ class Residue3D(Residue):
176
200
 
177
201
  @cached_property
178
202
  def is_nucleotide(self) -> bool:
179
- return len(self.atoms) > 1 and any(
180
- [atom for atom in self.atoms if atom.name == "C1'"]
203
+ return self.nucleotide_heavy_atoms.intersection(
204
+ set([atom.name for atom in self.atoms])
181
205
  )
182
206
 
183
207
  @cached_property
@@ -268,7 +292,7 @@ class Residue3D(Residue):
268
292
  logging.error(
269
293
  f"Failed to determine the outermost atom for nucleotide {self}, so an arbitrary atom will be used"
270
294
  )
271
- yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
295
+ yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
272
296
 
273
297
  def __inner_generator(self):
274
298
  # try to find expected atom name
@@ -296,7 +320,7 @@ class Residue3D(Residue):
296
320
  logging.error(
297
321
  f"Failed to determine the innermost atom for nucleotide {self}, so an arbitrary atom will be used"
298
322
  )
299
- yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
323
+ yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
300
324
 
301
325
 
302
326
  @dataclass(frozen=True, order=True)
@@ -524,7 +548,20 @@ class Mapping2D3D:
524
548
  result: Dict[int, List] = {}
525
549
  residue_map: Dict[Residue3D, int] = {}
526
550
  i = 1
527
- for residue in self.structure3d.residues:
551
+
552
+ for j, residue in enumerate(self.structure3d.residues):
553
+ if self.find_gaps and j > 0:
554
+ previous = self.structure3d.residues[j - 1]
555
+ if (
556
+ previous.is_nucleotide
557
+ and residue.is_nucleotide
558
+ and previous.label
559
+ and residue.label
560
+ and previous.label.chain == residue.label.chain
561
+ ):
562
+ for k in range(residue.label.number - previous.label.number - 1):
563
+ result[i] = [i, "?", 0]
564
+ i += 1
528
565
  if residue.is_nucleotide:
529
566
  result[i] = [i, residue.one_letter_name, 0]
530
567
  residue_map[residue] = i