RNApolis 0.3.18__py3-none-any.whl → 0.4.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: RNApolis
3
- Version: 0.3.18
3
+ Version: 0.4.0
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -4,14 +4,14 @@ rnapolis/common.py,sha256=PUYF01P2vevhyImhZjGYE0jJlsxWHX6GQmsxI4W7S-E,30255
4
4
  rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
5
5
  rnapolis/molecule_filter.py,sha256=hB6-nXgjmw7FAsQ3bj0cZ2FvuW2I1PXunEfcdwEUB1o,7389
6
6
  rnapolis/motif_extractor.py,sha256=duHvpi9Ulcny9K60E6VBpz5RpJZw-KdTB4_Ph0iP478,774
7
- rnapolis/parser.py,sha256=rQuzaRqsNTdHCS8_dKW5uT_nSi0xLnixF5xfy8puo_s,12665
7
+ rnapolis/parser.py,sha256=wCA9rXqt51iLECgeBqOShFpuT8JwanNkHYD5uXYvLzU,13988
8
8
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
9
- rnapolis/tertiary.py,sha256=wy8n7gfLHj_lIAydGgIwn_RMvBYiJzkSxXU_eff3ym0,19657
9
+ rnapolis/tertiary.py,sha256=iA5_ut1_nhwlbYpu2898h5li4pTEOc-iA_uK7vIvQ2o,20269
10
10
  rnapolis/transformer.py,sha256=V9nOQvdq4-p7yUWo0vQg0CDQMpmyxz9t4TMSRVEKHnw,1817
11
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
12
- RNApolis-0.3.18.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
- RNApolis-0.3.18.dist-info/METADATA,sha256=OkVdmxEbvCPPkUD1R49oZv5j89ng8-tds1yQeZYzORk,54323
14
- RNApolis-0.3.18.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
15
- RNApolis-0.3.18.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
- RNApolis-0.3.18.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
- RNApolis-0.3.18.dist-info/RECORD,,
12
+ RNApolis-0.4.0.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.4.0.dist-info/METADATA,sha256=kz7fPjyFm8j6HMwnDvtwlLkZGPpzw2Ce29XI1pVyIyU,54322
14
+ RNApolis-0.4.0.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
15
+ RNApolis-0.4.0.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.4.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.4.0.dist-info/RECORD,,
rnapolis/parser.py CHANGED
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
12
12
  def read_3d_structure(
13
13
  cif_or_pdb: IO[str], model: Optional[int] = None, nucleic_acid_only: bool = False
14
14
  ) -> Structure3D:
15
- atoms, modified, sequence = (
15
+ atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity = (
16
16
  parse_cif(cif_or_pdb) if is_cif(cif_or_pdb) else parse_pdb(cif_or_pdb)
17
17
  )
18
18
  available_models = {atom.model: None for atom in atoms}
@@ -24,7 +24,13 @@ def read_3d_structure(
24
24
  atoms = atoms_by_model[model]
25
25
  else:
26
26
  atoms = atoms_by_model[list(available_models.keys())[0]]
27
- return group_atoms(atoms, modified, sequence, nucleic_acid_only)
27
+ return group_atoms(
28
+ atoms,
29
+ modified,
30
+ sequence_by_entity,
31
+ is_nucleic_acid_by_entity,
32
+ nucleic_acid_only,
33
+ )
28
34
 
29
35
 
30
36
  def is_cif(cif_or_pdb: IO[str]) -> bool:
@@ -40,7 +46,8 @@ def parse_cif(
40
46
  ) -> Tuple[
41
47
  List[Atom],
42
48
  Dict[Union[ResidueLabel, ResidueAuth], str],
43
- Dict[Tuple[str, int], str],
49
+ Dict[str, str],
50
+ Dict[str, bool],
44
51
  ]:
45
52
  cif.seek(0)
46
53
 
@@ -48,7 +55,8 @@ def parse_cif(
48
55
  data = io_adapter.readFile(cif.name)
49
56
  atoms: List[Atom] = []
50
57
  modified: Dict[Union[ResidueLabel, ResidueAuth], str] = {}
51
- sequence = {}
58
+ sequence_by_entity = {}
59
+ is_nucleic_acid_by_entity = {}
52
60
 
53
61
  if data:
54
62
  atom_site = data[0].getObj("atom_site")
@@ -59,6 +67,7 @@ def parse_cif(
59
67
  for row in atom_site.getRowList():
60
68
  row_dict = dict(zip(atom_site.getAttributeList(), row))
61
69
 
70
+ label_entity_id = row_dict.get("label_entity_id", None)
62
71
  label_chain_name = row_dict.get("label_asym_id", None)
63
72
  label_residue_number = try_parse_int(row_dict.get("label_seq_id", None))
64
73
  label_residue_name = row_dict.get("label_comp_id", None)
@@ -127,7 +136,19 @@ def parse_cif(
127
136
  else None
128
137
  )
129
138
 
130
- atoms.append(Atom(label, auth, model, atom_name, x, y, z, occupancy))
139
+ atoms.append(
140
+ Atom(
141
+ label_entity_id,
142
+ label,
143
+ auth,
144
+ model,
145
+ atom_name,
146
+ x,
147
+ y,
148
+ z,
149
+ occupancy,
150
+ )
151
+ )
131
152
 
132
153
  if mod_residue:
133
154
  for row in mod_residue.getRowList():
@@ -178,17 +199,24 @@ def parse_cif(
178
199
  for row in entity_poly.getRowList():
179
200
  row_dict = dict(zip(entity_poly.getAttributeList(), row))
180
201
 
181
- pdbx_strand_id = row_dict.get("pdbx_strand_id", None)
202
+ entity_id = row_dict.get("entity_id", None)
203
+ type_ = row_dict.get("type", None)
182
204
  pdbx_seq_one_letter_code_can = row_dict.get(
183
205
  "pdbx_seq_one_letter_code_can", None
184
206
  )
185
207
 
186
- if pdbx_strand_id and pdbx_seq_one_letter_code_can:
187
- for strand in pdbx_strand_id.split(","):
188
- for i, letter in enumerate(pdbx_seq_one_letter_code_can):
189
- sequence[(strand, i + 1)] = letter
208
+ if entity_id and type_:
209
+ is_nucleic_acid_by_entity[entity_id] = type_ in (
210
+ "peptide nucleic acid",
211
+ "polydeoxyribonucleotide",
212
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
213
+ "polyribonucleotide",
214
+ )
215
+
216
+ if entity_id and pdbx_seq_one_letter_code_can:
217
+ sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
190
218
 
191
- return atoms, modified, sequence
219
+ return atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity
192
220
 
193
221
 
194
222
  def parse_pdb(
@@ -196,7 +224,8 @@ def parse_pdb(
196
224
  ) -> Tuple[
197
225
  List[Atom],
198
226
  Dict[Union[ResidueLabel, ResidueAuth], str],
199
- Dict[Tuple[str, int], str],
227
+ Dict[str, str],
228
+ Dict[str, bool],
200
229
  ]:
201
230
  pdb.seek(0)
202
231
  atoms: List[Atom] = []
@@ -222,7 +251,7 @@ def parse_pdb(
222
251
  auth = ResidueAuth(
223
252
  chain_identifier, residue_number, insertion_code, residue_name
224
253
  )
225
- atoms.append(Atom(None, auth, model, atom_name, x, y, z, occupancy))
254
+ atoms.append(Atom(None, None, auth, model, atom_name, x, y, z, occupancy))
226
255
  elif line.startswith("MODRES"):
227
256
  original_name = line[12:15]
228
257
  chain_identifier = line[16]
@@ -234,13 +263,14 @@ def parse_pdb(
234
263
  )
235
264
  modified[auth] = standard_residue_name
236
265
 
237
- return atoms, modified, {}
266
+ return atoms, modified, {}, {}
238
267
 
239
268
 
240
269
  def group_atoms(
241
270
  atoms: List[Atom],
242
271
  modified: Dict[Union[ResidueLabel, ResidueAuth], str],
243
- sequence: Dict[Tuple[str, int], str],
272
+ sequence_by_entity: Dict[str, str],
273
+ is_nucleic_acid_by_entity: Dict[str, bool],
244
274
  nucleic_acid_only: bool,
245
275
  ) -> Structure3D:
246
276
  if not atoms:
@@ -258,28 +288,45 @@ def group_atoms(
258
288
  label = key_previous[0]
259
289
  auth = key_previous[1]
260
290
  model = key_previous[2]
291
+ entity_id = residue_atoms[-1].entity_id
261
292
  name = get_residue_name(auth, label, modified)
262
- one_letter_name = get_one_letter_name(label, sequence, name)
263
- if one_letter_name not in "ACGUT":
293
+ one_letter_name = get_one_letter_name(
294
+ entity_id, label, sequence_by_entity, name
295
+ )
296
+
297
+ if one_letter_name not in "ACGUTN":
264
298
  one_letter_name = detect_one_letter_name(residue_atoms)
265
- residue = Residue3D(
266
- label, auth, model, one_letter_name, tuple(residue_atoms)
299
+
300
+ residues.append(
301
+ Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
267
302
  )
268
- if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
269
- residues.append(residue)
303
+
270
304
  key_previous = key
271
305
  residue_atoms = [atom]
272
306
 
273
307
  label = key_previous[0]
274
308
  auth = key_previous[1]
275
309
  model = key_previous[2]
310
+ entity_id = residue_atoms[-1].entity_id
276
311
  name = get_residue_name(auth, label, modified)
277
- one_letter_name = get_one_letter_name(label, sequence, name)
278
- if one_letter_name not in "ACGUT":
312
+ one_letter_name = get_one_letter_name(entity_id, label, sequence_by_entity, name)
313
+
314
+ if one_letter_name not in "ACGUTN":
279
315
  one_letter_name = detect_one_letter_name(residue_atoms)
280
- residue = Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
281
- if not nucleic_acid_only or (nucleic_acid_only and residue.is_nucleotide):
282
- residues.append(residue)
316
+
317
+ residues.append(
318
+ Residue3D(label, auth, model, one_letter_name, tuple(residue_atoms))
319
+ )
320
+
321
+ if nucleic_acid_only:
322
+ if is_nucleic_acid_by_entity:
323
+ residues = [
324
+ residue
325
+ for residue in residues
326
+ if is_nucleic_acid_by_entity[residue.atoms[0].entity_id]
327
+ ]
328
+ else:
329
+ residues = [residue for residue in residues if residue.is_nucleotide]
283
330
 
284
331
  return Structure3D(residues)
285
332
 
@@ -304,13 +351,14 @@ def get_residue_name(
304
351
 
305
352
 
306
353
  def get_one_letter_name(
307
- label: Optional[ResidueLabel], sequence: Dict[Tuple[str, int], str], name: str
354
+ entity_id: Optional[str],
355
+ label: Optional[ResidueLabel],
356
+ sequence_by_entity: Dict[str, str],
357
+ name: str,
308
358
  ) -> str:
309
359
  # try getting the value from _entity_poly first
310
- if label is not None:
311
- key = (label.chain, label.number)
312
- if key in sequence:
313
- return sequence[key]
360
+ if entity_id is not None and label is not None and entity_id in sequence_by_entity:
361
+ return sequence_by_entity[entity_id][label.number - 1]
314
362
  # RNA
315
363
  if len(name) == 1:
316
364
  return name
@@ -334,11 +382,13 @@ def detect_one_letter_name(atoms: List[Atom]) -> str:
334
382
  ) / len(atom_names_expected)
335
383
  score[candidate] = count
336
384
  items = sorted(score.items(), key=lambda kv: kv[1], reverse=True)
385
+ if items[0][1] == 0:
386
+ return "?"
337
387
  return items[0][0]
338
388
 
339
389
 
340
390
  def try_parse_int(s: str) -> Optional[int]:
341
391
  try:
342
392
  return int(s)
343
- except:
393
+ except ValueError:
344
394
  return None
rnapolis/tertiary.py CHANGED
@@ -96,6 +96,7 @@ AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
96
96
 
97
97
  @dataclass(frozen=True, order=True)
98
98
  class Atom:
99
+ entity_id: Optional[str]
99
100
  label: Optional[ResidueLabel]
100
101
  auth: Optional[ResidueAuth]
101
102
  model: int
@@ -128,6 +129,29 @@ class Residue3D(Residue):
128
129
  "C": set(["N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"]),
129
130
  "U": set(["N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"]),
130
131
  }
132
+ # Heavy atoms in nucleotide
133
+ nucleotide_heavy_atoms = (
134
+ set(
135
+ [
136
+ "P",
137
+ "OP1",
138
+ "OP2",
139
+ "O5'",
140
+ "C5'",
141
+ "C4'",
142
+ "O4'",
143
+ "C3'",
144
+ "O3'",
145
+ "C2'",
146
+ "O2'",
147
+ "C1'",
148
+ ]
149
+ )
150
+ .union(nucleobase_heavy_atoms["A"])
151
+ .union(nucleobase_heavy_atoms["G"])
152
+ .union(nucleobase_heavy_atoms["C"])
153
+ .union(nucleobase_heavy_atoms["U"])
154
+ )
131
155
 
132
156
  def __lt__(self, other):
133
157
  return (self.model, self.chain, self.number, self.icode or " ") < (
@@ -176,8 +200,8 @@ class Residue3D(Residue):
176
200
 
177
201
  @cached_property
178
202
  def is_nucleotide(self) -> bool:
179
- return len(self.atoms) > 1 and any(
180
- [atom for atom in self.atoms if atom.name == "C1'"]
203
+ return self.nucleotide_heavy_atoms.intersection(
204
+ set([atom.name for atom in self.atoms])
181
205
  )
182
206
 
183
207
  @cached_property
@@ -268,7 +292,7 @@ class Residue3D(Residue):
268
292
  logging.error(
269
293
  f"Failed to determine the outermost atom for nucleotide {self}, so an arbitrary atom will be used"
270
294
  )
271
- yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
295
+ yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
272
296
 
273
297
  def __inner_generator(self):
274
298
  # try to find expected atom name
@@ -296,7 +320,7 @@ class Residue3D(Residue):
296
320
  logging.error(
297
321
  f"Failed to determine the innermost atom for nucleotide {self}, so an arbitrary atom will be used"
298
322
  )
299
- yield Atom(self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
323
+ yield Atom(None, self.label, self.auth, self.model, "UNK", 0.0, 0.0, 0.0, None)
300
324
 
301
325
 
302
326
  @dataclass(frozen=True, order=True)