RNApolis 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
1
  #! /usr/bin/env python
2
2
  import argparse
3
+ import os
3
4
  import tempfile
5
+ from collections import defaultdict, namedtuple
4
6
  from typing import Iterable, List, Set, Tuple
5
7
 
6
8
  from mmcif.io.IoAdapterPy import IoAdapterPy
@@ -20,167 +22,104 @@ ENTITY_POLY_TYPES = [
20
22
  "polyribonucleotide",
21
23
  ]
22
24
 
23
- CATEGORIES_WITH_ENTITY_ID = [
24
- ("entity", "id"),
25
- ("atom_site", "label_entity_id"),
26
- ("entity_keywords", "entity_id"),
27
- ("entity_name_com", "entity_id"),
28
- ("entity_name_sys", "entity_id"),
29
- ("entity_poly", "entity_id"),
30
- ("entity_src_gen", "entity_id"),
31
- ("entity_src_nat", "entity_id"),
32
- ("pdbx_branch_scheme", "entity_id"),
33
- ("pdbx_chain_remapping", "entity_id"),
34
- ("pdbx_construct", "entity_id"),
35
- ("pdbx_entity_assembly", "entity_id"),
36
- ("pdbx_entity_branch", "entity_id"),
37
- ("pdbx_entity_branch_descriptor", "entity_id"),
38
- ("pdbx_entity_branch_list", "entity_id"),
39
- ("pdbx_entity_func_bind_mode", "entity_id"),
40
- ("pdbx_entity_name", "entity_id"),
41
- ("pdbx_entity_nonpoly", "entity_id"),
42
- ("pdbx_entity_poly_domain", "entity_id"),
43
- ("pdbx_entity_poly_na_nonstandard", "entity_id"),
44
- ("pdbx_entity_poly_na_type", "entity_id"),
45
- ("pdbx_entity_poly_protein_class", "entity_id"),
46
- ("pdbx_entity_prod_protocol", "entity_id"),
47
- ("pdbx_entity_remapping", "entity_id"),
48
- ("pdbx_entity_src_gen_character", "entity_id"),
49
- ("pdbx_entity_src_gen_chrom", "entity_id"),
50
- ("pdbx_entity_src_gen_clone", "entity_id"),
51
- ("pdbx_entity_src_gen_express", "entity_id"),
52
- ("pdbx_entity_src_gen_fract", "entity_id"),
53
- ("pdbx_entity_src_gen_lysis", "entity_id"),
54
- ("pdbx_entity_src_gen_prod_digest", "entity_id"),
55
- ("pdbx_entity_src_gen_prod_other", "entity_id"),
56
- ("pdbx_entity_src_gen_prod_pcr", "entity_id"),
57
- ("pdbx_entity_src_gen_proteolysis", "entity_id"),
58
- ("pdbx_entity_src_gen_pure", "entity_id"),
59
- ("pdbx_entity_src_gen_refold", "entity_id"),
60
- ("pdbx_entity_src_syn", "entity_id"),
61
- ("pdbx_linked_entity_list", "entity_id"),
62
- ("pdbx_prerelease_seq", "entity_id"),
63
- ("pdbx_sifts_xref_db", "entity_id"),
64
- ("pdbx_sifts_xref_db_segments", "entity_id"),
65
- ("pdbx_struct_entity_inst", "entity_id"),
66
- ("struct_asym", "entity_id"),
67
- ("struct_ref", "entity_id"),
68
- ]
69
-
70
- CATEGORIES_WITH_ASYM_ID = [
71
- ("pdbx_coordinate_model", "asym_id"),
72
- ("pdbx_distant_solvent_atoms", "label_asym_id"),
73
- ("pdbx_linked_entity_instance_list", "asym_id"),
74
- ("pdbx_poly_seq_scheme", "asym_id"),
75
- ("pdbx_sifts_unp_segments", "asym_id"),
76
- ("pdbx_struct_asym_gen", "asym_id"),
77
- ("pdbx_struct_ncs_virus_gen", "asym_id"),
78
- ("pdbx_struct_special_symmetry", "label_asym_id"),
79
- ("pdbx_unobs_or_zero_occ_atoms", "label_asym_id"),
80
- ("pdbx_unobs_or_zero_occ_residues", "label_asym_id"),
81
- ("refine_ls_restr_ncs", "pdbx_asym_id"),
82
- ("struct_biol_gen", "asym_id"),
83
- ]
84
-
85
- CATEGORIES_WITH_AUTH_ASYM_ID = [
86
- ("atom_site_anisotrop", "pdbx_auth_asym_id"),
87
- ("pdbx_atom_site_aniso_tls", "auth_asym_id"),
88
- ("pdbx_entity_instance_feature", "auth_asym_id"),
89
- ("pdbx_feature_monomer", "auth_asym_id"),
90
- ("pdbx_missing_atom_nonpoly", "auth_asym_id"),
91
- ("pdbx_missing_atom_poly", "auth_asym_id"),
92
- ("pdbx_modification_feature", "auth_asym_id"),
93
- ("pdbx_refine_component", "auth_asym_id"),
94
- ("pdbx_remediation_atom_site_mapping", "auth_asym_id"),
95
- ("pdbx_rmch_outlier", "auth_asym_id"),
96
- ("pdbx_rms_devs_cov_by_monomer", "auth_asym_id"),
97
- ("pdbx_sequence_pattern", "auth_asym_id"),
98
- ("pdbx_solvent_atom_site_mapping", "auth_asym_id"),
99
- ("pdbx_stereochemistry", "auth_asym_id"),
100
- ("pdbx_struct_chem_comp_diagnostics", "pdb_strand_id"),
101
- ("pdbx_struct_chem_comp_feature", "pdb_strand_id"),
102
- ("pdbx_struct_group_components", "auth_asym_id"),
103
- ("pdbx_struct_mod_residue", "auth_asym_id"),
104
- ("pdbx_sugar_phosphate_geometry", "auth_asym_id"),
105
- ("pdbx_validate_chiral", "auth_asym_id"),
106
- ("pdbx_validate_main_chain_plane", "auth_asym_id"),
107
- ("pdbx_validate_planes", "auth_asym_id"),
108
- ("pdbx_validate_planes_atom", "auth_asym_id"),
109
- ("pdbx_validate_torsion", "auth_asym_id"),
110
- ("struct_mon_nucl", "auth_asym_id"),
111
- ("struct_mon_prot", "auth_asym_id"),
112
- ("struct_site_gen", "auth_asym_id"),
113
- ]
# Link is one parent -> child item relationship: the category id and attribute
# name on the parent side, and the category id and attribute name on the child side.
25
+ Link = namedtuple(
26
+ "Link", ["parent_category_id", "parent_name", "child_category_id", "child_name"]
27
+ )
114
28
 
115
29
 
116
- def select_ids(
117
- data: List[DataContainer],
118
- obj_name: str,
119
- tested_field_name: str,
120
- extracted_field_name: str,
121
- accepted_values: Set[str],
122
- ) -> Set[str]:
123
- obj = data[0].getObj(obj_name)
124
- ids = set()
30
# load_pdbx_item_linked_group_list: read "mmcif_pdbx_v50.dic" located next to this
# module with IoAdapterPy and index the rows of its "pdbx_item_linked_group_list"
# category. Returns a defaultdict(set) keyed by parent_category_id whose values
# are sets of Link tuples.
# NOTE(review): data[0] is not guarded — presumably this fails if readFile yields
# no containers (e.g. the dictionary file is missing); confirm that is intended.
+ def load_pdbx_item_linked_group_list():
31
+ dictionary = os.path.join(
32
+ os.path.abspath(os.path.dirname(__file__)), "mmcif_pdbx_v50.dic"
33
+ )
34
+ adapter = IoAdapterPy()
35
+ data = adapter.readFile(dictionary)
36
+ obj = data[0].getObj("pdbx_item_linked_group_list")
37
+ links = defaultdict(set)
125
38
 
126
39
  if obj:
127
40
  for row in obj.getRowList():
128
41
  row_dict = dict(zip(obj.getAttributeList(), row))
42
+ child_category_id = row_dict["child_category_id"]
43
# Keep only the second dot-separated component of the item name —
# presumably names look like "_category.attribute"; TODO confirm.
+ child_name = row_dict["child_name"].split(".")[1]
44
+ parent_name = row_dict["parent_name"].split(".")[1]
45
+ parent_category_id = row_dict["parent_category_id"]
46
# Group by parent category; the set drops duplicate links.
+ links[parent_category_id].add(
47
+ Link(parent_category_id, parent_name, child_category_id, child_name)
48
+ )
129
49
 
130
- if row_dict.get(tested_field_name, None) in accepted_values:
131
- ids.add(row_dict[extracted_field_name])
50
+ return links
132
51
 
133
- return ids
52
+
def select_ids(
    data: "List[DataContainer]",
    category: str,
    field_name_to_extract: str,
    field_name_to_check: str,
    accepted_values: Iterable[str],
) -> Set[str]:
    """
    Collect the values of one column from the rows whose other column is accepted.

    Looks up ``category`` in the first container (``data[0]``) and returns the
    distinct ``field_name_to_extract`` values of every row whose
    ``field_name_to_check`` value is one of ``accepted_values``.

    An empty set is returned when the category is absent (or falsy) or when
    either field is not among the category's attributes.
    """
    obj = data[0].getObj(category)
    if not obj:
        return set()
    attributes = obj.getAttributeList()
    if field_name_to_check not in attributes or field_name_to_extract not in attributes:
        return set()
    # Materialise once: a one-shot iterable would otherwise be exhausted by the
    # first membership test, and a list would be scanned again for every row.
    accepted = set(accepted_values)
    index_to_check = attributes.index(field_name_to_check)
    index_to_extract = attributes.index(field_name_to_extract)
    return {
        row[index_to_extract]
        for row in obj.getRowList()
        if row[index_to_check] in accepted
    }
134
73
 
135
74
 
def select_category_by_id(
    data: "List[DataContainer]",
    category: str,
    field_name: str,
    ids: Iterable[str],
) -> Tuple[List[str], List[List[str]]]:
    """
    Return the attribute list of ``category`` and the rows whose ``field_name`` is in ``ids``.

    The category is looked up in the first container (``data[0]``). When it is
    absent (or falsy) the result is ``([], [])``; when the category exists but
    has no ``field_name`` attribute the result is ``(attributes, [])``.
    Matching rows are returned in their original order.
    """
    obj = data[0].getObj(category)
    if not obj:
        return [], []
    attributes = obj.getAttributeList()
    if field_name not in attributes:
        return attributes, []
    # Materialise once: a one-shot iterable would otherwise be exhausted by the
    # first membership test, and a list would be scanned again for every row.
    wanted = set(ids)
    index = attributes.index(field_name)
    return attributes, [row for row in obj.getRowList() if row[index] in wanted]
157
89
 
158
- def filter_by_poly_types(
159
- file_content: str,
160
- entity_poly_types: Iterable[str] = ["polyribonucleotide"],
161
- retain_categories: Iterable[str] = [],
162
- ) -> str:
163
- adapter = IoAdapterPy()
164
90
 
91
# read_cif: parse PDBx/mmCIF text. The text is written to a named temporary file
# and that file's path is handed to IoAdapterPy.readFile, whose result is returned.
# NOTE(review): the return annotation says DataContainer, but callers index the
# result as data[0] and pass it where List[DataContainer] is expected — the
# annotation looks inaccurate; confirm against IoAdapterPy.readFile.
+ def read_cif(file_content: str) -> DataContainer:
165
92
  with tempfile.NamedTemporaryFile("rt+") as f:
93
+ adapter = IoAdapterPy()
166
94
  f.write(file_content)
167
95
# seek(0) runs before the file is re-read by name below.
  f.seek(0)
168
- data = adapter.readFile(f.name)
169
-
170
- entity_ids = select_ids(
171
- data, "entity_poly", "type", "entity_id", set(entity_poly_types)
172
- )
173
- asym_ids = select_ids(data, "struct_asym", "entity_id", "id", entity_ids)
174
- auth_asym_ids = select_ids(
175
- data, "atom_site", "label_asym_id", "auth_asym_id", asym_ids
176
- )
96
# The file is read while the temporary file is still open (inside the with block).
+ return adapter.readFile(f.name)
97
+
98
+
99
+ def filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories):
100
+ links = load_pdbx_item_linked_group_list()
101
+ categories_with_entity_id = [("entity", "id")] + [
102
+ (link.child_category_id, link.child_name)
103
+ for link in links["entity"]
104
+ if link.parent_name == "id"
105
+ ]
106
+ categories_with_asym_id = [("struct_asym", "id")] + [
107
+ (link.child_category_id, link.child_name)
108
+ for link in links["struct_asym"]
109
+ if link.parent_name == "id"
110
+ ]
111
+ categories_with_auth_asym_id = [("atom_site", "auth_asym_id")] + [
112
+ (link.child_category_id, link.child_name)
113
+ for link in links["atom_site"]
114
+ if link.parent_name == "auth_asym_id"
115
+ ]
177
116
 
178
117
  output = DataContainer("rnapolis")
179
118
 
180
119
  for table, ids in (
181
- (CATEGORIES_WITH_ENTITY_ID, entity_ids),
182
- (CATEGORIES_WITH_ASYM_ID, asym_ids),
183
- (CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
120
+ (categories_with_entity_id, entity_ids),
121
+ (categories_with_asym_id, asym_ids),
122
+ (categories_with_auth_asym_id, auth_asym_ids),
184
123
  ):
185
124
  for category, field_name in table:
186
125
  attributes, rows = select_category_by_id(data, category, field_name, ids)
@@ -195,13 +134,36 @@ def filter_by_poly_types(
195
134
  output.append(obj)
196
135
 
197
136
  with tempfile.NamedTemporaryFile("rt+") as tmp:
137
+ adapter = IoAdapterPy()
198
138
  adapter.writeFile(tmp.name, [output])
199
139
  tmp.seek(0)
200
140
  return tmp.read()
201
141
 
202
142
 
def filter_by_poly_types(
    file_content: str,
    entity_poly_types: Iterable[str] = (
        "polyribonucleotide",
        "polydeoxyribonucleotide",
        "polydeoxyribonucleotide/polyribonucleotide hybrid",
    ),
    retain_categories: Iterable[str] = ("chem_comp",),
) -> str:
    """
    Filter a PDBx/mmCIF file by entity polymer type. The function returns a new PDBx/mmCIF file.

    Entities are selected from ``entity_poly`` by their ``type``; the matching
    ``struct_asym`` ids and the corresponding ``atom_site`` ``auth_asym_id``
    values are then derived from them, and all three id sets are handed to
    ``filter_cif`` together with ``retain_categories``.

    The defaults are immutable tuples so they cannot be altered between calls.
    """
    data = read_cif(file_content)
    entity_ids = select_ids(
        data, "entity_poly", "entity_id", "type", set(entity_poly_types)
    )
    asym_ids = select_ids(data, "struct_asym", "id", "entity_id", entity_ids)
    auth_asym_ids = select_ids(
        data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
    )
    # Hand over a fresh list so filter_cif keeps receiving a list and never
    # shares the caller's (or the default) container.
    return filter_cif(
        data, entity_ids, asym_ids, auth_asym_ids, list(retain_categories)
    )
161
+
162
+
203
163
  def filter_by_chains(
204
- file_content: str, chains: Iterable[str], retain_categories: Iterable[str] = []
164
+ file_content: str,
165
+ chains: Iterable[str],
166
+ retain_categories: Iterable[str] = ["chem_comp"],
205
167
  ) -> str:
206
168
  """
207
169
  Filter a PDBx/mmCIF file by chain IDs. The function returns a new PDBx/mmCIF file.
@@ -210,68 +172,55 @@ def filter_by_chains(
210
172
  This is because the function filters by entity, so if you ask for chain "A",
211
173
  which is part of entity 1 having chains "A", "B" and "C", then you will get all three chains.
212
174
  """
213
- adapter = IoAdapterPy()
214
-
215
- with tempfile.NamedTemporaryFile("rt+") as f:
216
- f.write(file_content)
217
- f.seek(0)
218
- data = adapter.readFile(f.name)
219
-
220
- output = DataContainer("rnapolis")
221
-
222
- entity_ids = select_ids(data, "struct_asym", "id", "entity_id", set(chains))
175
+ data = read_cif(file_content)
223
176
  asym_ids = set(chains)
177
+ entity_ids = select_ids(data, "struct_asym", "entity_id", "id", asym_ids)
224
178
  auth_asym_ids = select_ids(
225
- data, "atom_site", "label_asym_id", "auth_asym_id", asym_ids
179
+ data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
226
180
  )
227
-
228
- for table, ids in (
229
- (CATEGORIES_WITH_ENTITY_ID, entity_ids),
230
- (CATEGORIES_WITH_ASYM_ID, asym_ids),
231
- (CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
232
- ):
233
- for category, field_name in table:
234
- attributes, rows = select_category_by_id(data, category, field_name, ids)
235
-
236
- if attributes and rows:
237
- obj = DataCategory(category, attributes, rows)
238
- output.append(obj)
239
-
240
- for category in retain_categories:
241
- obj = data[0].getObj(category)
242
- if obj:
243
- output.append(obj)
244
-
245
- with tempfile.NamedTemporaryFile("rt+") as tmp:
246
- adapter.writeFile(tmp.name, [output])
247
- tmp.seek(0)
248
- return tmp.read()
181
+ return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
249
182
 
250
183
 
def main():
    """
    Command-line entry point: filter a PDBx/mmCIF file and print the result.

    ``--filter-by-poly-types`` takes precedence over ``--filter-by-chains`` when
    both are given; when neither is given the usage help is printed instead.
    """
    parser = argparse.ArgumentParser()
    poly_types_help = f"filter by entity poly types, possible values: {', '.join(ENTITY_POLY_TYPES)}"
    parser.add_argument(
        "--filter-by-poly-types",
        action="append",
        default=[],
        help=poly_types_help,
    )
    parser.add_argument(
        "--filter-by-chains",
        action="append",
        default=[],
        help="filter by chain IDs (label_asym_id), e.g. A, B, C",
    )
    # NOTE(review): with action="append" and a non-empty default, argparse keeps
    # the default and appends command-line values after it, so "chem_comp" is
    # always retained — confirm this is intended rather than an overridable default.
    parser.add_argument(
        "--retain-categories",
        action="append",
        default=["chem_comp"],
        help="categories to retain in the output file default: chem_comp",
    )
    parser.add_argument("path", help="path to a PDBx/mmCIF file")
    args = parser.parse_args()

    file = handle_input_file(args.path)
    if args.filter_by_poly_types:
        filtered = filter_by_poly_types(
            file.read(),
            entity_poly_types=args.filter_by_poly_types,
            retain_categories=args.retain_categories,
        )
    elif args.filter_by_chains:
        filtered = filter_by_chains(
            file.read(),
            chains=args.filter_by_chains,
            retain_categories=args.retain_categories,
        )
    else:
        # Nothing to filter by: show usage and stop without reading the input.
        parser.print_help()
        return
    print(filtered)
277
226