RNApolis 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.17.dist-info}/METADATA +1 -1
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.17.dist-info}/RECORD +8 -7
- rnapolis/mmcif_pdbx_v50.dic +173762 -0
- rnapolis/molecule_filter.py +130 -181
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.17.dist-info}/LICENSE +0 -0
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.17.dist-info}/WHEEL +0 -0
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.17.dist-info}/entry_points.txt +0 -0
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.17.dist-info}/top_level.txt +0 -0
rnapolis/molecule_filter.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
#! /usr/bin/env python
|
2
2
|
import argparse
|
3
|
+
import os
|
3
4
|
import tempfile
|
5
|
+
from collections import defaultdict, namedtuple
|
4
6
|
from typing import Iterable, List, Set, Tuple
|
5
7
|
|
6
8
|
from mmcif.io.IoAdapterPy import IoAdapterPy
|
@@ -20,167 +22,104 @@ ENTITY_POLY_TYPES = [
|
|
20
22
|
"polyribonucleotide",
|
21
23
|
]
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
("entity_keywords", "entity_id"),
|
27
|
-
("entity_name_com", "entity_id"),
|
28
|
-
("entity_name_sys", "entity_id"),
|
29
|
-
("entity_poly", "entity_id"),
|
30
|
-
("entity_src_gen", "entity_id"),
|
31
|
-
("entity_src_nat", "entity_id"),
|
32
|
-
("pdbx_branch_scheme", "entity_id"),
|
33
|
-
("pdbx_chain_remapping", "entity_id"),
|
34
|
-
("pdbx_construct", "entity_id"),
|
35
|
-
("pdbx_entity_assembly", "entity_id"),
|
36
|
-
("pdbx_entity_branch", "entity_id"),
|
37
|
-
("pdbx_entity_branch_descriptor", "entity_id"),
|
38
|
-
("pdbx_entity_branch_list", "entity_id"),
|
39
|
-
("pdbx_entity_func_bind_mode", "entity_id"),
|
40
|
-
("pdbx_entity_name", "entity_id"),
|
41
|
-
("pdbx_entity_nonpoly", "entity_id"),
|
42
|
-
("pdbx_entity_poly_domain", "entity_id"),
|
43
|
-
("pdbx_entity_poly_na_nonstandard", "entity_id"),
|
44
|
-
("pdbx_entity_poly_na_type", "entity_id"),
|
45
|
-
("pdbx_entity_poly_protein_class", "entity_id"),
|
46
|
-
("pdbx_entity_prod_protocol", "entity_id"),
|
47
|
-
("pdbx_entity_remapping", "entity_id"),
|
48
|
-
("pdbx_entity_src_gen_character", "entity_id"),
|
49
|
-
("pdbx_entity_src_gen_chrom", "entity_id"),
|
50
|
-
("pdbx_entity_src_gen_clone", "entity_id"),
|
51
|
-
("pdbx_entity_src_gen_express", "entity_id"),
|
52
|
-
("pdbx_entity_src_gen_fract", "entity_id"),
|
53
|
-
("pdbx_entity_src_gen_lysis", "entity_id"),
|
54
|
-
("pdbx_entity_src_gen_prod_digest", "entity_id"),
|
55
|
-
("pdbx_entity_src_gen_prod_other", "entity_id"),
|
56
|
-
("pdbx_entity_src_gen_prod_pcr", "entity_id"),
|
57
|
-
("pdbx_entity_src_gen_proteolysis", "entity_id"),
|
58
|
-
("pdbx_entity_src_gen_pure", "entity_id"),
|
59
|
-
("pdbx_entity_src_gen_refold", "entity_id"),
|
60
|
-
("pdbx_entity_src_syn", "entity_id"),
|
61
|
-
("pdbx_linked_entity_list", "entity_id"),
|
62
|
-
("pdbx_prerelease_seq", "entity_id"),
|
63
|
-
("pdbx_sifts_xref_db", "entity_id"),
|
64
|
-
("pdbx_sifts_xref_db_segments", "entity_id"),
|
65
|
-
("pdbx_struct_entity_inst", "entity_id"),
|
66
|
-
("struct_asym", "entity_id"),
|
67
|
-
("struct_ref", "entity_id"),
|
68
|
-
]
|
69
|
-
|
70
|
-
CATEGORIES_WITH_ASYM_ID = [
|
71
|
-
("pdbx_coordinate_model", "asym_id"),
|
72
|
-
("pdbx_distant_solvent_atoms", "label_asym_id"),
|
73
|
-
("pdbx_linked_entity_instance_list", "asym_id"),
|
74
|
-
("pdbx_poly_seq_scheme", "asym_id"),
|
75
|
-
("pdbx_sifts_unp_segments", "asym_id"),
|
76
|
-
("pdbx_struct_asym_gen", "asym_id"),
|
77
|
-
("pdbx_struct_ncs_virus_gen", "asym_id"),
|
78
|
-
("pdbx_struct_special_symmetry", "label_asym_id"),
|
79
|
-
("pdbx_unobs_or_zero_occ_atoms", "label_asym_id"),
|
80
|
-
("pdbx_unobs_or_zero_occ_residues", "label_asym_id"),
|
81
|
-
("refine_ls_restr_ncs", "pdbx_asym_id"),
|
82
|
-
("struct_biol_gen", "asym_id"),
|
83
|
-
]
|
84
|
-
|
85
|
-
CATEGORIES_WITH_AUTH_ASYM_ID = [
|
86
|
-
("atom_site_anisotrop", "pdbx_auth_asym_id"),
|
87
|
-
("pdbx_atom_site_aniso_tls", "auth_asym_id"),
|
88
|
-
("pdbx_entity_instance_feature", "auth_asym_id"),
|
89
|
-
("pdbx_feature_monomer", "auth_asym_id"),
|
90
|
-
("pdbx_missing_atom_nonpoly", "auth_asym_id"),
|
91
|
-
("pdbx_missing_atom_poly", "auth_asym_id"),
|
92
|
-
("pdbx_modification_feature", "auth_asym_id"),
|
93
|
-
("pdbx_refine_component", "auth_asym_id"),
|
94
|
-
("pdbx_remediation_atom_site_mapping", "auth_asym_id"),
|
95
|
-
("pdbx_rmch_outlier", "auth_asym_id"),
|
96
|
-
("pdbx_rms_devs_cov_by_monomer", "auth_asym_id"),
|
97
|
-
("pdbx_sequence_pattern", "auth_asym_id"),
|
98
|
-
("pdbx_solvent_atom_site_mapping", "auth_asym_id"),
|
99
|
-
("pdbx_stereochemistry", "auth_asym_id"),
|
100
|
-
("pdbx_struct_chem_comp_diagnostics", "pdb_strand_id"),
|
101
|
-
("pdbx_struct_chem_comp_feature", "pdb_strand_id"),
|
102
|
-
("pdbx_struct_group_components", "auth_asym_id"),
|
103
|
-
("pdbx_struct_mod_residue", "auth_asym_id"),
|
104
|
-
("pdbx_sugar_phosphate_geometry", "auth_asym_id"),
|
105
|
-
("pdbx_validate_chiral", "auth_asym_id"),
|
106
|
-
("pdbx_validate_main_chain_plane", "auth_asym_id"),
|
107
|
-
("pdbx_validate_planes", "auth_asym_id"),
|
108
|
-
("pdbx_validate_planes_atom", "auth_asym_id"),
|
109
|
-
("pdbx_validate_torsion", "auth_asym_id"),
|
110
|
-
("struct_mon_nucl", "auth_asym_id"),
|
111
|
-
("struct_mon_prot", "auth_asym_id"),
|
112
|
-
("struct_site_gen", "auth_asym_id"),
|
113
|
-
]
|
25
|
+
Link = namedtuple(
|
26
|
+
"Link", ["parent_category_id", "parent_name", "child_category_id", "child_name"]
|
27
|
+
)
|
114
28
|
|
115
29
|
|
116
|
-
def
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
ids = set()
|
30
|
+
def load_pdbx_item_linked_group_list():
|
31
|
+
dictionary = os.path.join(
|
32
|
+
os.path.abspath(os.path.dirname(__file__)), "mmcif_pdbx_v50.dic"
|
33
|
+
)
|
34
|
+
adapter = IoAdapterPy()
|
35
|
+
data = adapter.readFile(dictionary)
|
36
|
+
obj = data[0].getObj("pdbx_item_linked_group_list")
|
37
|
+
links = defaultdict(set)
|
125
38
|
|
126
39
|
if obj:
|
127
40
|
for row in obj.getRowList():
|
128
41
|
row_dict = dict(zip(obj.getAttributeList(), row))
|
42
|
+
child_category_id = row_dict["child_category_id"]
|
43
|
+
child_name = row_dict["child_name"].split(".")[1]
|
44
|
+
parent_name = row_dict["parent_name"].split(".")[1]
|
45
|
+
parent_category_id = row_dict["parent_category_id"]
|
46
|
+
links[parent_category_id].add(
|
47
|
+
Link(parent_category_id, parent_name, child_category_id, child_name)
|
48
|
+
)
|
129
49
|
|
130
|
-
|
131
|
-
ids.add(row_dict[extracted_field_name])
|
50
|
+
return links
|
132
51
|
|
133
|
-
|
52
|
+
|
53
|
+
def select_ids(
|
54
|
+
data: List[DataContainer],
|
55
|
+
category: str,
|
56
|
+
field_name_to_extract: str,
|
57
|
+
field_name_to_check: str,
|
58
|
+
accepted_values: Iterable[str],
|
59
|
+
) -> Set[str]:
|
60
|
+
obj = data[0].getObj(category)
|
61
|
+
if not obj:
|
62
|
+
return set()
|
63
|
+
attributes = obj.getAttributeList()
|
64
|
+
if field_name_to_check not in attributes or field_name_to_extract not in attributes:
|
65
|
+
return set()
|
66
|
+
index_to_check = attributes.index(field_name_to_check)
|
67
|
+
index_to_extract = attributes.index(field_name_to_extract)
|
68
|
+
return {
|
69
|
+
row[index_to_extract]
|
70
|
+
for row in obj.getRowList()
|
71
|
+
if row[index_to_check] in accepted_values
|
72
|
+
}
|
134
73
|
|
135
74
|
|
136
75
|
def select_category_by_id(
|
137
76
|
data: List[DataContainer],
|
138
77
|
category: str,
|
139
78
|
field_name: str,
|
140
|
-
ids:
|
79
|
+
ids: Iterable[str],
|
141
80
|
) -> Tuple[List[str], List[List[str]]]:
|
142
81
|
obj = data[0].getObj(category)
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
if
|
147
|
-
attributes
|
148
|
-
|
149
|
-
|
150
|
-
row_dict = dict(zip(obj.getAttributeList(), row))
|
151
|
-
|
152
|
-
if row_dict.get(field_name, None) in ids:
|
153
|
-
rows.append(row)
|
154
|
-
|
155
|
-
return attributes, rows
|
156
|
-
|
82
|
+
if not obj:
|
83
|
+
return [], []
|
84
|
+
attributes = obj.getAttributeList()
|
85
|
+
if field_name not in attributes:
|
86
|
+
return attributes, []
|
87
|
+
index = attributes.index(field_name)
|
88
|
+
return attributes, [row for row in obj.getRowList() if row[index] in ids]
|
157
89
|
|
158
|
-
def filter_by_poly_types(
|
159
|
-
file_content: str,
|
160
|
-
entity_poly_types: Iterable[str] = ["polyribonucleotide"],
|
161
|
-
retain_categories: Iterable[str] = [],
|
162
|
-
) -> str:
|
163
|
-
adapter = IoAdapterPy()
|
164
90
|
|
91
|
+
def read_cif(file_content: str) -> DataContainer:
|
165
92
|
with tempfile.NamedTemporaryFile("rt+") as f:
|
93
|
+
adapter = IoAdapterPy()
|
166
94
|
f.write(file_content)
|
167
95
|
f.seek(0)
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
)
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
96
|
+
return adapter.readFile(f.name)
|
97
|
+
|
98
|
+
|
99
|
+
def filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories):
|
100
|
+
links = load_pdbx_item_linked_group_list()
|
101
|
+
categories_with_entity_id = [("entity", "id")] + [
|
102
|
+
(link.child_category_id, link.child_name)
|
103
|
+
for link in links["entity"]
|
104
|
+
if link.parent_name == "id"
|
105
|
+
]
|
106
|
+
categories_with_asym_id = [("struct_asym", "id")] + [
|
107
|
+
(link.child_category_id, link.child_name)
|
108
|
+
for link in links["struct_asym"]
|
109
|
+
if link.parent_name == "id"
|
110
|
+
]
|
111
|
+
categories_with_auth_asym_id = [("atom_site", "auth_asym_id")] + [
|
112
|
+
(link.child_category_id, link.child_name)
|
113
|
+
for link in links["atom_site"]
|
114
|
+
if link.parent_name == "auth_asym_id"
|
115
|
+
]
|
177
116
|
|
178
117
|
output = DataContainer("rnapolis")
|
179
118
|
|
180
119
|
for table, ids in (
|
181
|
-
(
|
182
|
-
(
|
183
|
-
(
|
120
|
+
(categories_with_entity_id, entity_ids),
|
121
|
+
(categories_with_asym_id, asym_ids),
|
122
|
+
(categories_with_auth_asym_id, auth_asym_ids),
|
184
123
|
):
|
185
124
|
for category, field_name in table:
|
186
125
|
attributes, rows = select_category_by_id(data, category, field_name, ids)
|
@@ -195,13 +134,36 @@ def filter_by_poly_types(
|
|
195
134
|
output.append(obj)
|
196
135
|
|
197
136
|
with tempfile.NamedTemporaryFile("rt+") as tmp:
|
137
|
+
adapter = IoAdapterPy()
|
198
138
|
adapter.writeFile(tmp.name, [output])
|
199
139
|
tmp.seek(0)
|
200
140
|
return tmp.read()
|
201
141
|
|
202
142
|
|
143
|
+
def filter_by_poly_types(
|
144
|
+
file_content: str,
|
145
|
+
entity_poly_types: Iterable[str] = [
|
146
|
+
"polyribonucleotide",
|
147
|
+
"polydeoxyribonucleotide",
|
148
|
+
"polydeoxyribonucleotide/polyribonucleotide hybrid",
|
149
|
+
],
|
150
|
+
retain_categories: Iterable[str] = ["chem_comp"],
|
151
|
+
) -> str:
|
152
|
+
data = read_cif(file_content)
|
153
|
+
entity_ids = select_ids(
|
154
|
+
data, "entity_poly", "entity_id", "type", set(entity_poly_types)
|
155
|
+
)
|
156
|
+
asym_ids = select_ids(data, "struct_asym", "id", "entity_id", entity_ids)
|
157
|
+
auth_asym_ids = select_ids(
|
158
|
+
data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
|
159
|
+
)
|
160
|
+
return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
|
161
|
+
|
162
|
+
|
203
163
|
def filter_by_chains(
|
204
|
-
file_content: str,
|
164
|
+
file_content: str,
|
165
|
+
chains: Iterable[str],
|
166
|
+
retain_categories: Iterable[str] = ["chem_comp"],
|
205
167
|
) -> str:
|
206
168
|
"""
|
207
169
|
Filter a PDBx/mmCIF file by chain IDs. The function returns a new PDBx/mmCIF file.
|
@@ -210,68 +172,55 @@ def filter_by_chains(
|
|
210
172
|
This is because the function filters by entity, so if you ask for chain "A",
|
211
173
|
which is part of entity 1 having chains "A", "B" and "C", then you will get all three chains.
|
212
174
|
"""
|
213
|
-
|
214
|
-
|
215
|
-
with tempfile.NamedTemporaryFile("rt+") as f:
|
216
|
-
f.write(file_content)
|
217
|
-
f.seek(0)
|
218
|
-
data = adapter.readFile(f.name)
|
219
|
-
|
220
|
-
output = DataContainer("rnapolis")
|
221
|
-
|
222
|
-
entity_ids = select_ids(data, "struct_asym", "id", "entity_id", set(chains))
|
175
|
+
data = read_cif(file_content)
|
223
176
|
asym_ids = set(chains)
|
177
|
+
entity_ids = select_ids(data, "struct_asym", "entity_id", "id", asym_ids)
|
224
178
|
auth_asym_ids = select_ids(
|
225
|
-
data, "atom_site", "
|
179
|
+
data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
|
226
180
|
)
|
227
|
-
|
228
|
-
for table, ids in (
|
229
|
-
(CATEGORIES_WITH_ENTITY_ID, entity_ids),
|
230
|
-
(CATEGORIES_WITH_ASYM_ID, asym_ids),
|
231
|
-
(CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
|
232
|
-
):
|
233
|
-
for category, field_name in table:
|
234
|
-
attributes, rows = select_category_by_id(data, category, field_name, ids)
|
235
|
-
|
236
|
-
if attributes and rows:
|
237
|
-
obj = DataCategory(category, attributes, rows)
|
238
|
-
output.append(obj)
|
239
|
-
|
240
|
-
for category in retain_categories:
|
241
|
-
obj = data[0].getObj(category)
|
242
|
-
if obj:
|
243
|
-
output.append(obj)
|
244
|
-
|
245
|
-
with tempfile.NamedTemporaryFile("rt+") as tmp:
|
246
|
-
adapter.writeFile(tmp.name, [output])
|
247
|
-
tmp.seek(0)
|
248
|
-
return tmp.read()
|
181
|
+
return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
|
249
182
|
|
250
183
|
|
251
184
|
def main():
|
252
185
|
parser = argparse.ArgumentParser()
|
253
186
|
parser.add_argument(
|
254
|
-
"--
|
255
|
-
help="
|
187
|
+
"--filter-by-poly-types",
|
188
|
+
help=f"filter by entity poly types, possible values: {', '.join(ENTITY_POLY_TYPES)}",
|
256
189
|
action="append",
|
257
|
-
default=[
|
258
|
-
choices=ENTITY_POLY_TYPES,
|
190
|
+
default=[],
|
259
191
|
)
|
260
192
|
parser.add_argument(
|
261
|
-
"--
|
262
|
-
help="
|
193
|
+
"--filter-by-chains",
|
194
|
+
help="filter by chain IDs (label_asym_id), e.g. A, B, C",
|
263
195
|
action="append",
|
264
196
|
default=[],
|
265
197
|
)
|
198
|
+
parser.add_argument(
|
199
|
+
"--retain-categories",
|
200
|
+
help="categories to retain in the output file default: chem_comp",
|
201
|
+
action="append",
|
202
|
+
default=["chem_comp"],
|
203
|
+
)
|
266
204
|
parser.add_argument("path", help="path to a PDBx/mmCIF file")
|
267
205
|
args = parser.parse_args()
|
268
206
|
|
269
207
|
file = handle_input_file(args.path)
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
208
|
+
if args.filter_by_poly_types:
|
209
|
+
print(
|
210
|
+
filter_by_poly_types(
|
211
|
+
file.read(),
|
212
|
+
entity_poly_types=args.filter_by_poly_types,
|
213
|
+
retain_categories=args.retain_categories,
|
214
|
+
)
|
215
|
+
)
|
216
|
+
elif args.filter_by_chains:
|
217
|
+
print(
|
218
|
+
filter_by_chains(
|
219
|
+
file.read(),
|
220
|
+
chains=args.filter_by_chains,
|
221
|
+
retain_categories=args.retain_categories,
|
222
|
+
)
|
223
|
+
)
|
275
224
|
else:
|
276
225
|
parser.print_help()
|
277
226
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|