RNApolis 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: RNApolis
3
- Version: 0.4.15
3
+ Version: 0.4.16
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -2,16 +2,16 @@ rnapolis/annotator.py,sha256=hRRzRmneYxbg2tvwVHMWLfzmJb4szV0JL_6EOC09Gwg,22101
2
2
  rnapolis/clashfinder.py,sha256=i95kp0o6OWNqmJDBr-PbsZd7RY2iJtBDr7QqolJSuAQ,8513
3
3
  rnapolis/common.py,sha256=LY6Uz96Br8ki_gA8LpfatgtvVbt9jOTkwgagayqTgf8,31251
4
4
  rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
5
- rnapolis/molecule_filter.py,sha256=_7zrF-AH56vXI4vhFtfaU9wr-X7aeOL6DsXbX6_9vPQ,9497
5
+ rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
6
6
  rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
7
7
  rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
8
8
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
9
9
  rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
10
10
  rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
11
11
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
12
- RNApolis-0.4.15.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
- RNApolis-0.4.15.dist-info/METADATA,sha256=qLYXvmGt9gT0KsEnD-303D8EqqpVz5-hZYCekAOAn1E,54516
14
- RNApolis-0.4.15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
- RNApolis-0.4.15.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
- RNApolis-0.4.15.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
- RNApolis-0.4.15.dist-info/RECORD,,
12
+ RNApolis-0.4.16.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
13
+ RNApolis-0.4.16.dist-info/METADATA,sha256=Ouh1NQ3gFk7NrpInnQmBWJWRn_1JML9qi5c8MVk8_Q8,54516
14
+ RNApolis-0.4.16.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ RNApolis-0.4.16.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
16
+ RNApolis-0.4.16.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
17
+ RNApolis-0.4.16.dist-info/RECORD,,
@@ -1,6 +1,8 @@
1
1
  #! /usr/bin/env python
2
2
  import argparse
3
+ import os
3
4
  import tempfile
5
+ from collections import defaultdict, namedtuple
4
6
  from typing import Iterable, List, Set, Tuple
5
7
 
6
8
  from mmcif.io.IoAdapterPy import IoAdapterPy
@@ -20,167 +22,104 @@ ENTITY_POLY_TYPES = [
20
22
  "polyribonucleotide",
21
23
  ]
22
24
 
23
- CATEGORIES_WITH_ENTITY_ID = [
24
- ("entity", "id"),
25
- ("atom_site", "label_entity_id"),
26
- ("entity_keywords", "entity_id"),
27
- ("entity_name_com", "entity_id"),
28
- ("entity_name_sys", "entity_id"),
29
- ("entity_poly", "entity_id"),
30
- ("entity_src_gen", "entity_id"),
31
- ("entity_src_nat", "entity_id"),
32
- ("pdbx_branch_scheme", "entity_id"),
33
- ("pdbx_chain_remapping", "entity_id"),
34
- ("pdbx_construct", "entity_id"),
35
- ("pdbx_entity_assembly", "entity_id"),
36
- ("pdbx_entity_branch", "entity_id"),
37
- ("pdbx_entity_branch_descriptor", "entity_id"),
38
- ("pdbx_entity_branch_list", "entity_id"),
39
- ("pdbx_entity_func_bind_mode", "entity_id"),
40
- ("pdbx_entity_name", "entity_id"),
41
- ("pdbx_entity_nonpoly", "entity_id"),
42
- ("pdbx_entity_poly_domain", "entity_id"),
43
- ("pdbx_entity_poly_na_nonstandard", "entity_id"),
44
- ("pdbx_entity_poly_na_type", "entity_id"),
45
- ("pdbx_entity_poly_protein_class", "entity_id"),
46
- ("pdbx_entity_prod_protocol", "entity_id"),
47
- ("pdbx_entity_remapping", "entity_id"),
48
- ("pdbx_entity_src_gen_character", "entity_id"),
49
- ("pdbx_entity_src_gen_chrom", "entity_id"),
50
- ("pdbx_entity_src_gen_clone", "entity_id"),
51
- ("pdbx_entity_src_gen_express", "entity_id"),
52
- ("pdbx_entity_src_gen_fract", "entity_id"),
53
- ("pdbx_entity_src_gen_lysis", "entity_id"),
54
- ("pdbx_entity_src_gen_prod_digest", "entity_id"),
55
- ("pdbx_entity_src_gen_prod_other", "entity_id"),
56
- ("pdbx_entity_src_gen_prod_pcr", "entity_id"),
57
- ("pdbx_entity_src_gen_proteolysis", "entity_id"),
58
- ("pdbx_entity_src_gen_pure", "entity_id"),
59
- ("pdbx_entity_src_gen_refold", "entity_id"),
60
- ("pdbx_entity_src_syn", "entity_id"),
61
- ("pdbx_linked_entity_list", "entity_id"),
62
- ("pdbx_prerelease_seq", "entity_id"),
63
- ("pdbx_sifts_xref_db", "entity_id"),
64
- ("pdbx_sifts_xref_db_segments", "entity_id"),
65
- ("pdbx_struct_entity_inst", "entity_id"),
66
- ("struct_asym", "entity_id"),
67
- ("struct_ref", "entity_id"),
68
- ]
69
-
70
- CATEGORIES_WITH_ASYM_ID = [
71
- ("pdbx_coordinate_model", "asym_id"),
72
- ("pdbx_distant_solvent_atoms", "label_asym_id"),
73
- ("pdbx_linked_entity_instance_list", "asym_id"),
74
- ("pdbx_poly_seq_scheme", "asym_id"),
75
- ("pdbx_sifts_unp_segments", "asym_id"),
76
- ("pdbx_struct_asym_gen", "asym_id"),
77
- ("pdbx_struct_ncs_virus_gen", "asym_id"),
78
- ("pdbx_struct_special_symmetry", "label_asym_id"),
79
- ("pdbx_unobs_or_zero_occ_atoms", "label_asym_id"),
80
- ("pdbx_unobs_or_zero_occ_residues", "label_asym_id"),
81
- ("refine_ls_restr_ncs", "pdbx_asym_id"),
82
- ("struct_biol_gen", "asym_id"),
83
- ]
84
-
85
- CATEGORIES_WITH_AUTH_ASYM_ID = [
86
- ("atom_site_anisotrop", "pdbx_auth_asym_id"),
87
- ("pdbx_atom_site_aniso_tls", "auth_asym_id"),
88
- ("pdbx_entity_instance_feature", "auth_asym_id"),
89
- ("pdbx_feature_monomer", "auth_asym_id"),
90
- ("pdbx_missing_atom_nonpoly", "auth_asym_id"),
91
- ("pdbx_missing_atom_poly", "auth_asym_id"),
92
- ("pdbx_modification_feature", "auth_asym_id"),
93
- ("pdbx_refine_component", "auth_asym_id"),
94
- ("pdbx_remediation_atom_site_mapping", "auth_asym_id"),
95
- ("pdbx_rmch_outlier", "auth_asym_id"),
96
- ("pdbx_rms_devs_cov_by_monomer", "auth_asym_id"),
97
- ("pdbx_sequence_pattern", "auth_asym_id"),
98
- ("pdbx_solvent_atom_site_mapping", "auth_asym_id"),
99
- ("pdbx_stereochemistry", "auth_asym_id"),
100
- ("pdbx_struct_chem_comp_diagnostics", "pdb_strand_id"),
101
- ("pdbx_struct_chem_comp_feature", "pdb_strand_id"),
102
- ("pdbx_struct_group_components", "auth_asym_id"),
103
- ("pdbx_struct_mod_residue", "auth_asym_id"),
104
- ("pdbx_sugar_phosphate_geometry", "auth_asym_id"),
105
- ("pdbx_validate_chiral", "auth_asym_id"),
106
- ("pdbx_validate_main_chain_plane", "auth_asym_id"),
107
- ("pdbx_validate_planes", "auth_asym_id"),
108
- ("pdbx_validate_planes_atom", "auth_asym_id"),
109
- ("pdbx_validate_torsion", "auth_asym_id"),
110
- ("struct_mon_nucl", "auth_asym_id"),
111
- ("struct_mon_prot", "auth_asym_id"),
112
- ("struct_site_gen", "auth_asym_id"),
113
- ]
25
+ Link = namedtuple(
26
+ "Link", ["parent_category_id", "parent_name", "child_category_id", "child_name"]
27
+ )
114
28
 
115
29
 
116
- def select_ids(
117
- data: List[DataContainer],
118
- obj_name: str,
119
- tested_field_name: str,
120
- extracted_field_name: str,
121
- accepted_values: Set[str],
122
- ) -> Set[str]:
123
- obj = data[0].getObj(obj_name)
124
- ids = set()
30
+ def load_pdbx_item_linked_group_list():
31
+ dictionary = os.path.join(
32
+ os.path.abspath(os.path.dirname(__file__)), "mmcif_pdbx_v50.dic"
33
+ )
34
+ adapter = IoAdapterPy()
35
+ data = adapter.readFile(dictionary)
36
+ obj = data[0].getObj("pdbx_item_linked_group_list")
37
+ links = defaultdict(set)
125
38
 
126
39
  if obj:
127
40
  for row in obj.getRowList():
128
41
  row_dict = dict(zip(obj.getAttributeList(), row))
42
+ child_category_id = row_dict["child_category_id"]
43
+ child_name = row_dict["child_name"].split(".")[1]
44
+ parent_name = row_dict["parent_name"].split(".")[1]
45
+ parent_category_id = row_dict["parent_category_id"]
46
+ links[parent_category_id].add(
47
+ Link(parent_category_id, parent_name, child_category_id, child_name)
48
+ )
129
49
 
130
- if row_dict.get(tested_field_name, None) in accepted_values:
131
- ids.add(row_dict[extracted_field_name])
50
+ return links
132
51
 
133
- return ids
52
+
53
+ def select_ids(
54
+ data: List[DataContainer],
55
+ category: str,
56
+ field_name_to_extract: str,
57
+ field_name_to_check: str,
58
+ accepted_values: Iterable[str],
59
+ ) -> Set[str]:
60
+ obj = data[0].getObj(category)
61
+ if not obj:
62
+ return set()
63
+ attributes = obj.getAttributeList()
64
+ if field_name_to_check not in attributes or field_name_to_extract not in attributes:
65
+ return set()
66
+ index_to_check = attributes.index(field_name_to_check)
67
+ index_to_extract = attributes.index(field_name_to_extract)
68
+ return {
69
+ row[index_to_extract]
70
+ for row in obj.getRowList()
71
+ if row[index_to_check] in accepted_values
72
+ }
134
73
 
135
74
 
136
75
  def select_category_by_id(
137
76
  data: List[DataContainer],
138
77
  category: str,
139
78
  field_name: str,
140
- ids: List[str],
79
+ ids: Iterable[str],
141
80
  ) -> Tuple[List[str], List[List[str]]]:
142
81
  obj = data[0].getObj(category)
143
- attributes = []
144
- rows = []
145
-
146
- if obj:
147
- attributes = obj.getAttributeList()
148
-
149
- for row in obj.getRowList():
150
- row_dict = dict(zip(obj.getAttributeList(), row))
151
-
152
- if row_dict.get(field_name, None) in ids:
153
- rows.append(row)
154
-
155
- return attributes, rows
156
-
82
+ if not obj:
83
+ return [], []
84
+ attributes = obj.getAttributeList()
85
+ if field_name not in attributes:
86
+ return attributes, []
87
+ index = attributes.index(field_name)
88
+ return attributes, [row for row in obj.getRowList() if row[index] in ids]
157
89
 
158
- def filter_by_poly_types(
159
- file_content: str,
160
- entity_poly_types: Iterable[str] = ["polyribonucleotide"],
161
- retain_categories: Iterable[str] = [],
162
- ) -> str:
163
- adapter = IoAdapterPy()
164
90
 
91
+ def read_cif(file_content: str) -> DataContainer:
165
92
  with tempfile.NamedTemporaryFile("rt+") as f:
93
+ adapter = IoAdapterPy()
166
94
  f.write(file_content)
167
95
  f.seek(0)
168
- data = adapter.readFile(f.name)
169
-
170
- entity_ids = select_ids(
171
- data, "entity_poly", "type", "entity_id", set(entity_poly_types)
172
- )
173
- asym_ids = select_ids(data, "struct_asym", "entity_id", "id", entity_ids)
174
- auth_asym_ids = select_ids(
175
- data, "atom_site", "label_asym_id", "auth_asym_id", asym_ids
176
- )
96
+ return adapter.readFile(f.name)
97
+
98
+
99
+ def filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories):
100
+ links = load_pdbx_item_linked_group_list()
101
+ categories_with_entity_id = [("entity", "id")] + [
102
+ (link.child_category_id, link.child_name)
103
+ for link in links["entity"]
104
+ if link.parent_name == "id"
105
+ ]
106
+ categories_with_asym_id = [("struct_asym", "id")] + [
107
+ (link.child_category_id, link.child_name)
108
+ for link in links["struct_asym"]
109
+ if link.parent_name == "id"
110
+ ]
111
+ categories_with_auth_asym_id = [("atom_site", "auth_asym_id")] + [
112
+ (link.child_category_id, link.child_name)
113
+ for link in links["atom_site"]
114
+ if link.parent_name == "auth_asym_id"
115
+ ]
177
116
 
178
117
  output = DataContainer("rnapolis")
179
118
 
180
119
  for table, ids in (
181
- (CATEGORIES_WITH_ENTITY_ID, entity_ids),
182
- (CATEGORIES_WITH_ASYM_ID, asym_ids),
183
- (CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
120
+ (categories_with_entity_id, entity_ids),
121
+ (categories_with_asym_id, asym_ids),
122
+ (categories_with_auth_asym_id, auth_asym_ids),
184
123
  ):
185
124
  for category, field_name in table:
186
125
  attributes, rows = select_category_by_id(data, category, field_name, ids)
@@ -195,13 +134,36 @@ def filter_by_poly_types(
195
134
  output.append(obj)
196
135
 
197
136
  with tempfile.NamedTemporaryFile("rt+") as tmp:
137
+ adapter = IoAdapterPy()
198
138
  adapter.writeFile(tmp.name, [output])
199
139
  tmp.seek(0)
200
140
  return tmp.read()
201
141
 
202
142
 
143
+ def filter_by_poly_types(
144
+ file_content: str,
145
+ entity_poly_types: Iterable[str] = [
146
+ "polyribonucleotide",
147
+ "polydeoxyribonucleotide",
148
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
149
+ ],
150
+ retain_categories: Iterable[str] = ["chem_comp"],
151
+ ) -> str:
152
+ data = read_cif(file_content)
153
+ entity_ids = select_ids(
154
+ data, "entity_poly", "entity_id", "type", set(entity_poly_types)
155
+ )
156
+ asym_ids = select_ids(data, "struct_asym", "id", "entity_id", entity_ids)
157
+ auth_asym_ids = select_ids(
158
+ data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
159
+ )
160
+ return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
161
+
162
+
203
163
  def filter_by_chains(
204
- file_content: str, chains: Iterable[str], retain_categories: Iterable[str] = []
164
+ file_content: str,
165
+ chains: Iterable[str],
166
+ retain_categories: Iterable[str] = ["chem_comp"],
205
167
  ) -> str:
206
168
  """
207
169
  Filter a PDBx/mmCIF file by chain IDs. The function returns a new PDBx/mmCIF file.
@@ -210,68 +172,55 @@ def filter_by_chains(
210
172
  This is because the function filters by entity, so if you ask for chain "A",
211
173
  which is part of entity 1 having chains "A", "B" and "C", then you will get all three chains.
212
174
  """
213
- adapter = IoAdapterPy()
214
-
215
- with tempfile.NamedTemporaryFile("rt+") as f:
216
- f.write(file_content)
217
- f.seek(0)
218
- data = adapter.readFile(f.name)
219
-
220
- output = DataContainer("rnapolis")
221
-
222
- entity_ids = select_ids(data, "struct_asym", "id", "entity_id", set(chains))
175
+ data = read_cif(file_content)
223
176
  asym_ids = set(chains)
177
+ entity_ids = select_ids(data, "struct_asym", "entity_id", "id", asym_ids)
224
178
  auth_asym_ids = select_ids(
225
- data, "atom_site", "label_asym_id", "auth_asym_id", asym_ids
179
+ data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
226
180
  )
227
-
228
- for table, ids in (
229
- (CATEGORIES_WITH_ENTITY_ID, entity_ids),
230
- (CATEGORIES_WITH_ASYM_ID, asym_ids),
231
- (CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
232
- ):
233
- for category, field_name in table:
234
- attributes, rows = select_category_by_id(data, category, field_name, ids)
235
-
236
- if attributes and rows:
237
- obj = DataCategory(category, attributes, rows)
238
- output.append(obj)
239
-
240
- for category in retain_categories:
241
- obj = data[0].getObj(category)
242
- if obj:
243
- output.append(obj)
244
-
245
- with tempfile.NamedTemporaryFile("rt+") as tmp:
246
- adapter.writeFile(tmp.name, [output])
247
- tmp.seek(0)
248
- return tmp.read()
181
+ return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
249
182
 
250
183
 
251
184
  def main():
252
185
  parser = argparse.ArgumentParser()
253
186
  parser.add_argument(
254
- "--type",
255
- help="a type of molecule to select, you can provide this argument multiple times (default: polyribonucleotide)",
187
+ "--filter-by-poly-types",
188
+ help=f"filter by entity poly types, possible values: {', '.join(ENTITY_POLY_TYPES)}",
256
189
  action="append",
257
- default=["polyribonucleotide"],
258
- choices=ENTITY_POLY_TYPES,
190
+ default=[],
259
191
  )
260
192
  parser.add_argument(
261
- "--chain",
262
- help="a chain ID (label_asym_id) to select, you can provide this argument multiple times (if provided, it overrides the --type argument)",
193
+ "--filter-by-chains",
194
+ help="filter by chain IDs (label_asym_id), e.g. A, B, C",
263
195
  action="append",
264
196
  default=[],
265
197
  )
198
+ parser.add_argument(
199
+ "--retain-categories",
200
+ help="categories to retain in the output file default: chem_comp",
201
+ action="append",
202
+ default=["chem_comp"],
203
+ )
266
204
  parser.add_argument("path", help="path to a PDBx/mmCIF file")
267
205
  args = parser.parse_args()
268
206
 
269
207
  file = handle_input_file(args.path)
270
-
271
- if args.chain:
272
- print(filter_by_chains(file.read(), args.chain))
273
- elif args.type:
274
- print(filter_by_poly_types(file.read(), args.type))
208
+ if args.filter_by_poly_types:
209
+ print(
210
+ filter_by_poly_types(
211
+ file.read(),
212
+ entity_poly_types=args.filter_by_poly_types,
213
+ retain_categories=args.retain_categories,
214
+ )
215
+ )
216
+ elif args.filter_by_chains:
217
+ print(
218
+ filter_by_chains(
219
+ file.read(),
220
+ chains=args.filter_by_chains,
221
+ retain_categories=args.retain_categories,
222
+ )
223
+ )
275
224
  else:
276
225
  parser.print_help()
277
226