RNApolis 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.16.dist-info}/METADATA +1 -1
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.16.dist-info}/RECORD +7 -7
- rnapolis/molecule_filter.py +130 -181
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.16.dist-info}/LICENSE +0 -0
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.16.dist-info}/WHEEL +0 -0
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.16.dist-info}/entry_points.txt +0 -0
- {RNApolis-0.4.15.dist-info → RNApolis-0.4.16.dist-info}/top_level.txt +0 -0
@@ -2,16 +2,16 @@ rnapolis/annotator.py,sha256=hRRzRmneYxbg2tvwVHMWLfzmJb4szV0JL_6EOC09Gwg,22101
|
|
2
2
|
rnapolis/clashfinder.py,sha256=i95kp0o6OWNqmJDBr-PbsZd7RY2iJtBDr7QqolJSuAQ,8513
|
3
3
|
rnapolis/common.py,sha256=LY6Uz96Br8ki_gA8LpfatgtvVbt9jOTkwgagayqTgf8,31251
|
4
4
|
rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
|
5
|
-
rnapolis/molecule_filter.py,sha256=
|
5
|
+
rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
|
6
6
|
rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
|
7
7
|
rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
|
8
8
|
rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
|
9
9
|
rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
|
10
10
|
rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
|
11
11
|
rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
|
12
|
-
RNApolis-0.4.15.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
13
|
-
RNApolis-0.4.15.dist-info/METADATA,sha256=
|
14
|
-
RNApolis-0.4.15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
-
RNApolis-0.4.15.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
|
16
|
-
RNApolis-0.4.15.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
17
|
-
RNApolis-0.4.15.dist-info/RECORD,,
|
12
|
+
RNApolis-0.4.16.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
13
|
+
RNApolis-0.4.16.dist-info/METADATA,sha256=Ouh1NQ3gFk7NrpInnQmBWJWRn_1JML9qi5c8MVk8_Q8,54516
|
14
|
+
RNApolis-0.4.16.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
15
|
+
RNApolis-0.4.16.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
|
16
|
+
RNApolis-0.4.16.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
17
|
+
RNApolis-0.4.16.dist-info/RECORD,,
|
rnapolis/molecule_filter.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
#! /usr/bin/env python
|
2
2
|
import argparse
|
3
|
+
import os
|
3
4
|
import tempfile
|
5
|
+
from collections import defaultdict, namedtuple
|
4
6
|
from typing import Iterable, List, Set, Tuple
|
5
7
|
|
6
8
|
from mmcif.io.IoAdapterPy import IoAdapterPy
|
@@ -20,167 +22,104 @@ ENTITY_POLY_TYPES = [
|
|
20
22
|
"polyribonucleotide",
|
21
23
|
]
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
("entity_keywords", "entity_id"),
|
27
|
-
("entity_name_com", "entity_id"),
|
28
|
-
("entity_name_sys", "entity_id"),
|
29
|
-
("entity_poly", "entity_id"),
|
30
|
-
("entity_src_gen", "entity_id"),
|
31
|
-
("entity_src_nat", "entity_id"),
|
32
|
-
("pdbx_branch_scheme", "entity_id"),
|
33
|
-
("pdbx_chain_remapping", "entity_id"),
|
34
|
-
("pdbx_construct", "entity_id"),
|
35
|
-
("pdbx_entity_assembly", "entity_id"),
|
36
|
-
("pdbx_entity_branch", "entity_id"),
|
37
|
-
("pdbx_entity_branch_descriptor", "entity_id"),
|
38
|
-
("pdbx_entity_branch_list", "entity_id"),
|
39
|
-
("pdbx_entity_func_bind_mode", "entity_id"),
|
40
|
-
("pdbx_entity_name", "entity_id"),
|
41
|
-
("pdbx_entity_nonpoly", "entity_id"),
|
42
|
-
("pdbx_entity_poly_domain", "entity_id"),
|
43
|
-
("pdbx_entity_poly_na_nonstandard", "entity_id"),
|
44
|
-
("pdbx_entity_poly_na_type", "entity_id"),
|
45
|
-
("pdbx_entity_poly_protein_class", "entity_id"),
|
46
|
-
("pdbx_entity_prod_protocol", "entity_id"),
|
47
|
-
("pdbx_entity_remapping", "entity_id"),
|
48
|
-
("pdbx_entity_src_gen_character", "entity_id"),
|
49
|
-
("pdbx_entity_src_gen_chrom", "entity_id"),
|
50
|
-
("pdbx_entity_src_gen_clone", "entity_id"),
|
51
|
-
("pdbx_entity_src_gen_express", "entity_id"),
|
52
|
-
("pdbx_entity_src_gen_fract", "entity_id"),
|
53
|
-
("pdbx_entity_src_gen_lysis", "entity_id"),
|
54
|
-
("pdbx_entity_src_gen_prod_digest", "entity_id"),
|
55
|
-
("pdbx_entity_src_gen_prod_other", "entity_id"),
|
56
|
-
("pdbx_entity_src_gen_prod_pcr", "entity_id"),
|
57
|
-
("pdbx_entity_src_gen_proteolysis", "entity_id"),
|
58
|
-
("pdbx_entity_src_gen_pure", "entity_id"),
|
59
|
-
("pdbx_entity_src_gen_refold", "entity_id"),
|
60
|
-
("pdbx_entity_src_syn", "entity_id"),
|
61
|
-
("pdbx_linked_entity_list", "entity_id"),
|
62
|
-
("pdbx_prerelease_seq", "entity_id"),
|
63
|
-
("pdbx_sifts_xref_db", "entity_id"),
|
64
|
-
("pdbx_sifts_xref_db_segments", "entity_id"),
|
65
|
-
("pdbx_struct_entity_inst", "entity_id"),
|
66
|
-
("struct_asym", "entity_id"),
|
67
|
-
("struct_ref", "entity_id"),
|
68
|
-
]
|
69
|
-
|
70
|
-
CATEGORIES_WITH_ASYM_ID = [
|
71
|
-
("pdbx_coordinate_model", "asym_id"),
|
72
|
-
("pdbx_distant_solvent_atoms", "label_asym_id"),
|
73
|
-
("pdbx_linked_entity_instance_list", "asym_id"),
|
74
|
-
("pdbx_poly_seq_scheme", "asym_id"),
|
75
|
-
("pdbx_sifts_unp_segments", "asym_id"),
|
76
|
-
("pdbx_struct_asym_gen", "asym_id"),
|
77
|
-
("pdbx_struct_ncs_virus_gen", "asym_id"),
|
78
|
-
("pdbx_struct_special_symmetry", "label_asym_id"),
|
79
|
-
("pdbx_unobs_or_zero_occ_atoms", "label_asym_id"),
|
80
|
-
("pdbx_unobs_or_zero_occ_residues", "label_asym_id"),
|
81
|
-
("refine_ls_restr_ncs", "pdbx_asym_id"),
|
82
|
-
("struct_biol_gen", "asym_id"),
|
83
|
-
]
|
84
|
-
|
85
|
-
CATEGORIES_WITH_AUTH_ASYM_ID = [
|
86
|
-
("atom_site_anisotrop", "pdbx_auth_asym_id"),
|
87
|
-
("pdbx_atom_site_aniso_tls", "auth_asym_id"),
|
88
|
-
("pdbx_entity_instance_feature", "auth_asym_id"),
|
89
|
-
("pdbx_feature_monomer", "auth_asym_id"),
|
90
|
-
("pdbx_missing_atom_nonpoly", "auth_asym_id"),
|
91
|
-
("pdbx_missing_atom_poly", "auth_asym_id"),
|
92
|
-
("pdbx_modification_feature", "auth_asym_id"),
|
93
|
-
("pdbx_refine_component", "auth_asym_id"),
|
94
|
-
("pdbx_remediation_atom_site_mapping", "auth_asym_id"),
|
95
|
-
("pdbx_rmch_outlier", "auth_asym_id"),
|
96
|
-
("pdbx_rms_devs_cov_by_monomer", "auth_asym_id"),
|
97
|
-
("pdbx_sequence_pattern", "auth_asym_id"),
|
98
|
-
("pdbx_solvent_atom_site_mapping", "auth_asym_id"),
|
99
|
-
("pdbx_stereochemistry", "auth_asym_id"),
|
100
|
-
("pdbx_struct_chem_comp_diagnostics", "pdb_strand_id"),
|
101
|
-
("pdbx_struct_chem_comp_feature", "pdb_strand_id"),
|
102
|
-
("pdbx_struct_group_components", "auth_asym_id"),
|
103
|
-
("pdbx_struct_mod_residue", "auth_asym_id"),
|
104
|
-
("pdbx_sugar_phosphate_geometry", "auth_asym_id"),
|
105
|
-
("pdbx_validate_chiral", "auth_asym_id"),
|
106
|
-
("pdbx_validate_main_chain_plane", "auth_asym_id"),
|
107
|
-
("pdbx_validate_planes", "auth_asym_id"),
|
108
|
-
("pdbx_validate_planes_atom", "auth_asym_id"),
|
109
|
-
("pdbx_validate_torsion", "auth_asym_id"),
|
110
|
-
("struct_mon_nucl", "auth_asym_id"),
|
111
|
-
("struct_mon_prot", "auth_asym_id"),
|
112
|
-
("struct_site_gen", "auth_asym_id"),
|
113
|
-
]
|
25
|
+
Link = namedtuple(
|
26
|
+
"Link", ["parent_category_id", "parent_name", "child_category_id", "child_name"]
|
27
|
+
)
|
114
28
|
|
115
29
|
|
116
|
-
def
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
ids = set()
|
30
|
+
def load_pdbx_item_linked_group_list():
|
31
|
+
dictionary = os.path.join(
|
32
|
+
os.path.abspath(os.path.dirname(__file__)), "mmcif_pdbx_v50.dic"
|
33
|
+
)
|
34
|
+
adapter = IoAdapterPy()
|
35
|
+
data = adapter.readFile(dictionary)
|
36
|
+
obj = data[0].getObj("pdbx_item_linked_group_list")
|
37
|
+
links = defaultdict(set)
|
125
38
|
|
126
39
|
if obj:
|
127
40
|
for row in obj.getRowList():
|
128
41
|
row_dict = dict(zip(obj.getAttributeList(), row))
|
42
|
+
child_category_id = row_dict["child_category_id"]
|
43
|
+
child_name = row_dict["child_name"].split(".")[1]
|
44
|
+
parent_name = row_dict["parent_name"].split(".")[1]
|
45
|
+
parent_category_id = row_dict["parent_category_id"]
|
46
|
+
links[parent_category_id].add(
|
47
|
+
Link(parent_category_id, parent_name, child_category_id, child_name)
|
48
|
+
)
|
129
49
|
|
130
|
-
|
131
|
-
ids.add(row_dict[extracted_field_name])
|
50
|
+
return links
|
132
51
|
|
133
|
-
|
52
|
+
|
53
|
+
def select_ids(
|
54
|
+
data: List[DataContainer],
|
55
|
+
category: str,
|
56
|
+
field_name_to_extract: str,
|
57
|
+
field_name_to_check: str,
|
58
|
+
accepted_values: Iterable[str],
|
59
|
+
) -> Set[str]:
|
60
|
+
obj = data[0].getObj(category)
|
61
|
+
if not obj:
|
62
|
+
return set()
|
63
|
+
attributes = obj.getAttributeList()
|
64
|
+
if field_name_to_check not in attributes or field_name_to_extract not in attributes:
|
65
|
+
return set()
|
66
|
+
index_to_check = attributes.index(field_name_to_check)
|
67
|
+
index_to_extract = attributes.index(field_name_to_extract)
|
68
|
+
return {
|
69
|
+
row[index_to_extract]
|
70
|
+
for row in obj.getRowList()
|
71
|
+
if row[index_to_check] in accepted_values
|
72
|
+
}
|
134
73
|
|
135
74
|
|
136
75
|
def select_category_by_id(
|
137
76
|
data: List[DataContainer],
|
138
77
|
category: str,
|
139
78
|
field_name: str,
|
140
|
-
ids:
|
79
|
+
ids: Iterable[str],
|
141
80
|
) -> Tuple[List[str], List[List[str]]]:
|
142
81
|
obj = data[0].getObj(category)
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
if
|
147
|
-
attributes
|
148
|
-
|
149
|
-
|
150
|
-
row_dict = dict(zip(obj.getAttributeList(), row))
|
151
|
-
|
152
|
-
if row_dict.get(field_name, None) in ids:
|
153
|
-
rows.append(row)
|
154
|
-
|
155
|
-
return attributes, rows
|
156
|
-
|
82
|
+
if not obj:
|
83
|
+
return [], []
|
84
|
+
attributes = obj.getAttributeList()
|
85
|
+
if field_name not in attributes:
|
86
|
+
return attributes, []
|
87
|
+
index = attributes.index(field_name)
|
88
|
+
return attributes, [row for row in obj.getRowList() if row[index] in ids]
|
157
89
|
|
158
|
-
def filter_by_poly_types(
|
159
|
-
file_content: str,
|
160
|
-
entity_poly_types: Iterable[str] = ["polyribonucleotide"],
|
161
|
-
retain_categories: Iterable[str] = [],
|
162
|
-
) -> str:
|
163
|
-
adapter = IoAdapterPy()
|
164
90
|
|
91
|
+
def read_cif(file_content: str) -> DataContainer:
|
165
92
|
with tempfile.NamedTemporaryFile("rt+") as f:
|
93
|
+
adapter = IoAdapterPy()
|
166
94
|
f.write(file_content)
|
167
95
|
f.seek(0)
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
)
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
96
|
+
return adapter.readFile(f.name)
|
97
|
+
|
98
|
+
|
99
|
+
def filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories):
|
100
|
+
links = load_pdbx_item_linked_group_list()
|
101
|
+
categories_with_entity_id = [("entity", "id")] + [
|
102
|
+
(link.child_category_id, link.child_name)
|
103
|
+
for link in links["entity"]
|
104
|
+
if link.parent_name == "id"
|
105
|
+
]
|
106
|
+
categories_with_asym_id = [("struct_asym", "id")] + [
|
107
|
+
(link.child_category_id, link.child_name)
|
108
|
+
for link in links["struct_asym"]
|
109
|
+
if link.parent_name == "id"
|
110
|
+
]
|
111
|
+
categories_with_auth_asym_id = [("atom_site", "auth_asym_id")] + [
|
112
|
+
(link.child_category_id, link.child_name)
|
113
|
+
for link in links["atom_site"]
|
114
|
+
if link.parent_name == "auth_asym_id"
|
115
|
+
]
|
177
116
|
|
178
117
|
output = DataContainer("rnapolis")
|
179
118
|
|
180
119
|
for table, ids in (
|
181
|
-
(
|
182
|
-
(
|
183
|
-
(
|
120
|
+
(categories_with_entity_id, entity_ids),
|
121
|
+
(categories_with_asym_id, asym_ids),
|
122
|
+
(categories_with_auth_asym_id, auth_asym_ids),
|
184
123
|
):
|
185
124
|
for category, field_name in table:
|
186
125
|
attributes, rows = select_category_by_id(data, category, field_name, ids)
|
@@ -195,13 +134,36 @@ def filter_by_poly_types(
|
|
195
134
|
output.append(obj)
|
196
135
|
|
197
136
|
with tempfile.NamedTemporaryFile("rt+") as tmp:
|
137
|
+
adapter = IoAdapterPy()
|
198
138
|
adapter.writeFile(tmp.name, [output])
|
199
139
|
tmp.seek(0)
|
200
140
|
return tmp.read()
|
201
141
|
|
202
142
|
|
143
|
+
def filter_by_poly_types(
|
144
|
+
file_content: str,
|
145
|
+
entity_poly_types: Iterable[str] = [
|
146
|
+
"polyribonucleotide",
|
147
|
+
"polydeoxyribonucleotide",
|
148
|
+
"polydeoxyribonucleotide/polyribonucleotide hybrid",
|
149
|
+
],
|
150
|
+
retain_categories: Iterable[str] = ["chem_comp"],
|
151
|
+
) -> str:
|
152
|
+
data = read_cif(file_content)
|
153
|
+
entity_ids = select_ids(
|
154
|
+
data, "entity_poly", "entity_id", "type", set(entity_poly_types)
|
155
|
+
)
|
156
|
+
asym_ids = select_ids(data, "struct_asym", "id", "entity_id", entity_ids)
|
157
|
+
auth_asym_ids = select_ids(
|
158
|
+
data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
|
159
|
+
)
|
160
|
+
return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
|
161
|
+
|
162
|
+
|
203
163
|
def filter_by_chains(
|
204
|
-
file_content: str,
|
164
|
+
file_content: str,
|
165
|
+
chains: Iterable[str],
|
166
|
+
retain_categories: Iterable[str] = ["chem_comp"],
|
205
167
|
) -> str:
|
206
168
|
"""
|
207
169
|
Filter a PDBx/mmCIF file by chain IDs. The function returns a new PDBx/mmCIF file.
|
@@ -210,68 +172,55 @@ def filter_by_chains(
|
|
210
172
|
This is because the function filters by entity, so if you ask for chain "A",
|
211
173
|
which is part of entity 1 having chains "A", "B" and "C", then you will get all three chains.
|
212
174
|
"""
|
213
|
-
|
214
|
-
|
215
|
-
with tempfile.NamedTemporaryFile("rt+") as f:
|
216
|
-
f.write(file_content)
|
217
|
-
f.seek(0)
|
218
|
-
data = adapter.readFile(f.name)
|
219
|
-
|
220
|
-
output = DataContainer("rnapolis")
|
221
|
-
|
222
|
-
entity_ids = select_ids(data, "struct_asym", "id", "entity_id", set(chains))
|
175
|
+
data = read_cif(file_content)
|
223
176
|
asym_ids = set(chains)
|
177
|
+
entity_ids = select_ids(data, "struct_asym", "entity_id", "id", asym_ids)
|
224
178
|
auth_asym_ids = select_ids(
|
225
|
-
data, "atom_site", "
|
179
|
+
data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
|
226
180
|
)
|
227
|
-
|
228
|
-
for table, ids in (
|
229
|
-
(CATEGORIES_WITH_ENTITY_ID, entity_ids),
|
230
|
-
(CATEGORIES_WITH_ASYM_ID, asym_ids),
|
231
|
-
(CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
|
232
|
-
):
|
233
|
-
for category, field_name in table:
|
234
|
-
attributes, rows = select_category_by_id(data, category, field_name, ids)
|
235
|
-
|
236
|
-
if attributes and rows:
|
237
|
-
obj = DataCategory(category, attributes, rows)
|
238
|
-
output.append(obj)
|
239
|
-
|
240
|
-
for category in retain_categories:
|
241
|
-
obj = data[0].getObj(category)
|
242
|
-
if obj:
|
243
|
-
output.append(obj)
|
244
|
-
|
245
|
-
with tempfile.NamedTemporaryFile("rt+") as tmp:
|
246
|
-
adapter.writeFile(tmp.name, [output])
|
247
|
-
tmp.seek(0)
|
248
|
-
return tmp.read()
|
181
|
+
return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
|
249
182
|
|
250
183
|
|
251
184
|
def main():
|
252
185
|
parser = argparse.ArgumentParser()
|
253
186
|
parser.add_argument(
|
254
|
-
"--
|
255
|
-
help="
|
187
|
+
"--filter-by-poly-types",
|
188
|
+
help=f"filter by entity poly types, possible values: {', '.join(ENTITY_POLY_TYPES)}",
|
256
189
|
action="append",
|
257
|
-
default=[
|
258
|
-
choices=ENTITY_POLY_TYPES,
|
190
|
+
default=[],
|
259
191
|
)
|
260
192
|
parser.add_argument(
|
261
|
-
"--
|
262
|
-
help="
|
193
|
+
"--filter-by-chains",
|
194
|
+
help="filter by chain IDs (label_asym_id), e.g. A, B, C",
|
263
195
|
action="append",
|
264
196
|
default=[],
|
265
197
|
)
|
198
|
+
parser.add_argument(
|
199
|
+
"--retain-categories",
|
200
|
+
help="categories to retain in the output file default: chem_comp",
|
201
|
+
action="append",
|
202
|
+
default=["chem_comp"],
|
203
|
+
)
|
266
204
|
parser.add_argument("path", help="path to a PDBx/mmCIF file")
|
267
205
|
args = parser.parse_args()
|
268
206
|
|
269
207
|
file = handle_input_file(args.path)
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
208
|
+
if args.filter_by_poly_types:
|
209
|
+
print(
|
210
|
+
filter_by_poly_types(
|
211
|
+
file.read(),
|
212
|
+
entity_poly_types=args.filter_by_poly_types,
|
213
|
+
retain_categories=args.retain_categories,
|
214
|
+
)
|
215
|
+
)
|
216
|
+
elif args.filter_by_chains:
|
217
|
+
print(
|
218
|
+
filter_by_chains(
|
219
|
+
file.read(),
|
220
|
+
chains=args.filter_by_chains,
|
221
|
+
retain_categories=args.retain_categories,
|
222
|
+
)
|
223
|
+
)
|
275
224
|
else:
|
276
225
|
parser.print_help()
|
277
226
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|