RNApolis 0.4.14__tar.gz → 0.4.16__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. {rnapolis-0.4.14/src/RNApolis.egg-info → rnapolis-0.4.16}/PKG-INFO +11 -2
  2. {rnapolis-0.4.14 → rnapolis-0.4.16}/setup.py +1 -1
  3. {rnapolis-0.4.14 → rnapolis-0.4.16/src/RNApolis.egg-info}/PKG-INFO +11 -2
  4. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/annotator.py +2 -1
  5. rnapolis-0.4.16/src/rnapolis/molecule_filter.py +229 -0
  6. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/parser.py +13 -2
  7. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_parser.py +19 -0
  8. rnapolis-0.4.14/src/rnapolis/molecule_filter.py +0 -280
  9. {rnapolis-0.4.14 → rnapolis-0.4.16}/LICENSE +0 -0
  10. {rnapolis-0.4.14 → rnapolis-0.4.16}/README.md +0 -0
  11. {rnapolis-0.4.14 → rnapolis-0.4.16}/pyproject.toml +0 -0
  12. {rnapolis-0.4.14 → rnapolis-0.4.16}/setup.cfg +0 -0
  13. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/RNApolis.egg-info/SOURCES.txt +0 -0
  14. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/RNApolis.egg-info/dependency_links.txt +0 -0
  15. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/RNApolis.egg-info/entry_points.txt +0 -0
  16. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/RNApolis.egg-info/requires.txt +0 -0
  17. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/RNApolis.egg-info/top_level.txt +0 -0
  18. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/clashfinder.py +0 -0
  19. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/common.py +0 -0
  20. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/metareader.py +0 -0
  21. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/motif_extractor.py +0 -0
  22. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/rfam_folder.py +0 -0
  23. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/tertiary.py +0 -0
  24. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/transformer.py +0 -0
  25. {rnapolis-0.4.14 → rnapolis-0.4.16}/src/rnapolis/util.py +0 -0
  26. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_annotator.py +0 -0
  27. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_bugfixes.py +0 -0
  28. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_common.py +0 -0
  29. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_metareader.py +0 -0
  30. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_molecule_filter.py +0 -0
  31. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_quadruplexes.py +0 -0
  32. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_rfam_folder.py +0 -0
  33. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_tertiary.py +0 -0
  34. {rnapolis-0.4.14 → rnapolis-0.4.16}/tests/test_transformer.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: RNApolis
3
- Version: 0.4.14
3
+ Version: 0.4.16
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -26,6 +26,15 @@ Requires-Dist: pulp
26
26
  Requires-Dist: requests
27
27
  Requires-Dist: scipy
28
28
  Requires-Dist: viennarna
29
+ Dynamic: author
30
+ Dynamic: author-email
31
+ Dynamic: classifier
32
+ Dynamic: description
33
+ Dynamic: description-content-type
34
+ Dynamic: home-page
35
+ Dynamic: project-url
36
+ Dynamic: requires-dist
37
+ Dynamic: summary
29
38
 
30
39
  # RNApolis
31
40
 
@@ -5,7 +5,7 @@ with open("README.md") as f:
5
5
 
6
6
  setup(
7
7
  name="RNApolis",
8
- version="0.4.14",
8
+ version="0.4.16",
9
9
  packages=["rnapolis"],
10
10
  package_dir={"": "src"},
11
11
  author="Tomasz Zok",
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: RNApolis
3
- Version: 0.4.14
3
+ Version: 0.4.16
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -26,6 +26,15 @@ Requires-Dist: pulp
26
26
  Requires-Dist: requests
27
27
  Requires-Dist: scipy
28
28
  Requires-Dist: viennarna
29
+ Dynamic: author
30
+ Dynamic: author-email
31
+ Dynamic: classifier
32
+ Dynamic: description
33
+ Dynamic: description-content-type
34
+ Dynamic: home-page
35
+ Dynamic: project-url
36
+ Dynamic: requires-dist
37
+ Dynamic: summary
29
38
 
30
39
  # RNApolis
31
40
 
@@ -11,6 +11,8 @@ import numpy
11
11
  import numpy.typing
12
12
  import orjson
13
13
  from ordered_set import OrderedSet
14
+ from scipy.spatial import KDTree
15
+
14
16
  from rnapolis.common import (
15
17
  BR,
16
18
  BaseInteractions,
@@ -42,7 +44,6 @@ from rnapolis.tertiary import (
42
44
  torsion_angle,
43
45
  )
44
46
  from rnapolis.util import handle_input_file
45
- from scipy.spatial import KDTree
46
47
 
47
48
  HYDROGEN_BOND_MAX_DISTANCE = 4.0
48
49
  HYDROGEN_BOND_ANGLE_RANGE = (50.0, 130.0) # 90 degrees is ideal, so allow +- 40 degrees
@@ -0,0 +1,229 @@
1
+ #! /usr/bin/env python
2
+ import argparse
3
+ import os
4
+ import tempfile
5
+ from collections import defaultdict, namedtuple
6
+ from typing import Iterable, List, Set, Tuple
7
+
8
+ from mmcif.io.IoAdapterPy import IoAdapterPy
9
+ from mmcif.io.PdbxReader import DataCategory, DataContainer
10
+
11
+ from rnapolis.util import handle_input_file
12
+
13
+ # Source: https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_entity_poly.type.html
14
+ ENTITY_POLY_TYPES = [
15
+ "cyclic-pseudo-peptide",
16
+ "other",
17
+ "peptide nucleic acid",
18
+ "polydeoxyribonucleotide",
19
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
20
+ "polypeptide(D)",
21
+ "polypeptide(L)",
22
+ "polyribonucleotide",
23
+ ]
24
+
25
+ Link = namedtuple(
26
+ "Link", ["parent_category_id", "parent_name", "child_category_id", "child_name"]
27
+ )
28
+
29
+
30
+ def load_pdbx_item_linked_group_list():
31
+ dictionary = os.path.join(
32
+ os.path.abspath(os.path.dirname(__file__)), "mmcif_pdbx_v50.dic"
33
+ )
34
+ adapter = IoAdapterPy()
35
+ data = adapter.readFile(dictionary)
36
+ obj = data[0].getObj("pdbx_item_linked_group_list")
37
+ links = defaultdict(set)
38
+
39
+ if obj:
40
+ for row in obj.getRowList():
41
+ row_dict = dict(zip(obj.getAttributeList(), row))
42
+ child_category_id = row_dict["child_category_id"]
43
+ child_name = row_dict["child_name"].split(".")[1]
44
+ parent_name = row_dict["parent_name"].split(".")[1]
45
+ parent_category_id = row_dict["parent_category_id"]
46
+ links[parent_category_id].add(
47
+ Link(parent_category_id, parent_name, child_category_id, child_name)
48
+ )
49
+
50
+ return links
51
+
52
+
53
+ def select_ids(
54
+ data: List[DataContainer],
55
+ category: str,
56
+ field_name_to_extract: str,
57
+ field_name_to_check: str,
58
+ accepted_values: Iterable[str],
59
+ ) -> Set[str]:
60
+ obj = data[0].getObj(category)
61
+ if not obj:
62
+ return set()
63
+ attributes = obj.getAttributeList()
64
+ if field_name_to_check not in attributes or field_name_to_extract not in attributes:
65
+ return set()
66
+ index_to_check = attributes.index(field_name_to_check)
67
+ index_to_extract = attributes.index(field_name_to_extract)
68
+ return {
69
+ row[index_to_extract]
70
+ for row in obj.getRowList()
71
+ if row[index_to_check] in accepted_values
72
+ }
73
+
74
+
75
+ def select_category_by_id(
76
+ data: List[DataContainer],
77
+ category: str,
78
+ field_name: str,
79
+ ids: Iterable[str],
80
+ ) -> Tuple[List[str], List[List[str]]]:
81
+ obj = data[0].getObj(category)
82
+ if not obj:
83
+ return [], []
84
+ attributes = obj.getAttributeList()
85
+ if field_name not in attributes:
86
+ return attributes, []
87
+ index = attributes.index(field_name)
88
+ return attributes, [row for row in obj.getRowList() if row[index] in ids]
89
+
90
+
91
+ def read_cif(file_content: str) -> DataContainer:
92
+ with tempfile.NamedTemporaryFile("rt+") as f:
93
+ adapter = IoAdapterPy()
94
+ f.write(file_content)
95
+ f.seek(0)
96
+ return adapter.readFile(f.name)
97
+
98
+
99
+ def filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories):
100
+ links = load_pdbx_item_linked_group_list()
101
+ categories_with_entity_id = [("entity", "id")] + [
102
+ (link.child_category_id, link.child_name)
103
+ for link in links["entity"]
104
+ if link.parent_name == "id"
105
+ ]
106
+ categories_with_asym_id = [("struct_asym", "id")] + [
107
+ (link.child_category_id, link.child_name)
108
+ for link in links["struct_asym"]
109
+ if link.parent_name == "id"
110
+ ]
111
+ categories_with_auth_asym_id = [("atom_site", "auth_asym_id")] + [
112
+ (link.child_category_id, link.child_name)
113
+ for link in links["atom_site"]
114
+ if link.parent_name == "auth_asym_id"
115
+ ]
116
+
117
+ output = DataContainer("rnapolis")
118
+
119
+ for table, ids in (
120
+ (categories_with_entity_id, entity_ids),
121
+ (categories_with_asym_id, asym_ids),
122
+ (categories_with_auth_asym_id, auth_asym_ids),
123
+ ):
124
+ for category, field_name in table:
125
+ attributes, rows = select_category_by_id(data, category, field_name, ids)
126
+
127
+ if attributes and rows:
128
+ obj = DataCategory(category, attributes, rows)
129
+ output.append(obj)
130
+
131
+ for category in retain_categories:
132
+ obj = data[0].getObj(category)
133
+ if obj:
134
+ output.append(obj)
135
+
136
+ with tempfile.NamedTemporaryFile("rt+") as tmp:
137
+ adapter = IoAdapterPy()
138
+ adapter.writeFile(tmp.name, [output])
139
+ tmp.seek(0)
140
+ return tmp.read()
141
+
142
+
143
+ def filter_by_poly_types(
144
+ file_content: str,
145
+ entity_poly_types: Iterable[str] = [
146
+ "polyribonucleotide",
147
+ "polydeoxyribonucleotide",
148
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
149
+ ],
150
+ retain_categories: Iterable[str] = ["chem_comp"],
151
+ ) -> str:
152
+ data = read_cif(file_content)
153
+ entity_ids = select_ids(
154
+ data, "entity_poly", "entity_id", "type", set(entity_poly_types)
155
+ )
156
+ asym_ids = select_ids(data, "struct_asym", "id", "entity_id", entity_ids)
157
+ auth_asym_ids = select_ids(
158
+ data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
159
+ )
160
+ return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
161
+
162
+
163
+ def filter_by_chains(
164
+ file_content: str,
165
+ chains: Iterable[str],
166
+ retain_categories: Iterable[str] = ["chem_comp"],
167
+ ) -> str:
168
+ """
169
+ Filter a PDBx/mmCIF file by chain IDs. The function returns a new PDBx/mmCIF file.
170
+
171
+ Warning! The new file might contain more chains than provided in the `chains` argument.
172
+ This is because the function filters by entity, so if you ask for chain "A",
173
+ which is part of entity 1 having chains "A", "B" and "C", then you will get all three chains.
174
+ """
175
+ data = read_cif(file_content)
176
+ asym_ids = set(chains)
177
+ entity_ids = select_ids(data, "struct_asym", "entity_id", "id", asym_ids)
178
+ auth_asym_ids = select_ids(
179
+ data, "atom_site", "auth_asym_id", "label_asym_id", asym_ids
180
+ )
181
+ return filter_cif(data, entity_ids, asym_ids, auth_asym_ids, retain_categories)
182
+
183
+
184
+ def main():
185
+ parser = argparse.ArgumentParser()
186
+ parser.add_argument(
187
+ "--filter-by-poly-types",
188
+ help=f"filter by entity poly types, possible values: {', '.join(ENTITY_POLY_TYPES)}",
189
+ action="append",
190
+ default=[],
191
+ )
192
+ parser.add_argument(
193
+ "--filter-by-chains",
194
+ help="filter by chain IDs (label_asym_id), e.g. A, B, C",
195
+ action="append",
196
+ default=[],
197
+ )
198
+ parser.add_argument(
199
+ "--retain-categories",
200
+ help="categories to retain in the output file default: chem_comp",
201
+ action="append",
202
+ default=["chem_comp"],
203
+ )
204
+ parser.add_argument("path", help="path to a PDBx/mmCIF file")
205
+ args = parser.parse_args()
206
+
207
+ file = handle_input_file(args.path)
208
+ if args.filter_by_poly_types:
209
+ print(
210
+ filter_by_poly_types(
211
+ file.read(),
212
+ entity_poly_types=args.filter_by_poly_types,
213
+ retain_categories=args.retain_categories,
214
+ )
215
+ )
216
+ elif args.filter_by_chains:
217
+ print(
218
+ filter_by_chains(
219
+ file.read(),
220
+ chains=args.filter_by_chains,
221
+ retain_categories=args.retain_categories,
222
+ )
223
+ )
224
+ else:
225
+ parser.print_help()
226
+
227
+
228
+ if __name__ == "__main__":
229
+ main()
@@ -218,7 +218,9 @@ def parse_cif(
218
218
  )
219
219
 
220
220
  if entity_id and pdbx_seq_one_letter_code_can:
221
- sequence_by_entity[entity_id] = pdbx_seq_one_letter_code_can
221
+ sequence_by_entity[entity_id] = (
222
+ pdbx_seq_one_letter_code_can.replace("\n", "")
223
+ )
222
224
 
223
225
  if entity:
224
226
  for row in entity.getRowList():
@@ -234,7 +236,16 @@ def parse_cif(
234
236
 
235
237
  if type_:
236
238
  is_nucleic_acid_by_entity[entity_id] = (
237
- is_nucleic_acid_by_entity.get(entity_id, type_)
239
+ is_nucleic_acid_by_entity.get(
240
+ entity_id,
241
+ type_
242
+ in (
243
+ "peptide nucleic acid",
244
+ "polydeoxyribonucleotide",
245
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
246
+ "polyribonucleotide",
247
+ ),
248
+ )
238
249
  )
239
250
 
240
251
  atoms = filter_clashing_atoms(atoms_to_process)
@@ -1,3 +1,5 @@
1
+ import gzip
2
+
1
3
  from rnapolis.parser import read_3d_structure
2
4
 
3
5
 
@@ -31,3 +33,20 @@ def test_4qln_no_duplicate_atoms():
31
33
  assert len(atom_names) == len(
32
34
  set(atom_names)
33
35
  ), f"Duplicate atoms found in residue {residue.auth}"
36
+
37
+
38
+ def test_1gid():
39
+ expected_sequence = "GAAUUGCGGGAAAGGGGUCAACAGCCGUUCAGUACCAAGUCUCAGGGGAAACUUUGAGAUGGCCUUGCAAAGGGUAUGGUAAUAAGCUGACGGACAUGGUCCUAACCACGCAGCCAAGUCCUAAGUCAACAGAUCUUCUGUUGAUAUGGAUGCAGUUC"
40
+
41
+ with gzip.open("tests/1gid.cif.gz", "rt") as f:
42
+ structure3d = read_3d_structure(f, nucleic_acid_only=True)
43
+
44
+ residues_a = [r for r in structure3d.residues if r.auth.chain == "A"]
45
+ residues_b = [r for r in structure3d.residues if r.auth.chain == "B"]
46
+ assert len(residues_a) == len(expected_sequence)
47
+ assert len(residues_b) == len(expected_sequence)
48
+
49
+ actual_sequence_a = "".join([residue.one_letter_name for residue in residues_a])
50
+ actual_sequence_b = "".join([residue.one_letter_name for residue in residues_b])
51
+ assert actual_sequence_a == expected_sequence
52
+ assert actual_sequence_b == expected_sequence
@@ -1,280 +0,0 @@
1
- #! /usr/bin/env python
2
- import argparse
3
- import tempfile
4
- from typing import Iterable, List, Set, Tuple
5
-
6
- from mmcif.io.IoAdapterPy import IoAdapterPy
7
- from mmcif.io.PdbxReader import DataCategory, DataContainer
8
-
9
- from rnapolis.util import handle_input_file
10
-
11
- # Source: https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_entity_poly.type.html
12
- ENTITY_POLY_TYPES = [
13
- "cyclic-pseudo-peptide",
14
- "other",
15
- "peptide nucleic acid",
16
- "polydeoxyribonucleotide",
17
- "polydeoxyribonucleotide/polyribonucleotide hybrid",
18
- "polypeptide(D)",
19
- "polypeptide(L)",
20
- "polyribonucleotide",
21
- ]
22
-
23
- CATEGORIES_WITH_ENTITY_ID = [
24
- ("entity", "id"),
25
- ("atom_site", "label_entity_id"),
26
- ("entity_keywords", "entity_id"),
27
- ("entity_name_com", "entity_id"),
28
- ("entity_name_sys", "entity_id"),
29
- ("entity_poly", "entity_id"),
30
- ("entity_src_gen", "entity_id"),
31
- ("entity_src_nat", "entity_id"),
32
- ("pdbx_branch_scheme", "entity_id"),
33
- ("pdbx_chain_remapping", "entity_id"),
34
- ("pdbx_construct", "entity_id"),
35
- ("pdbx_entity_assembly", "entity_id"),
36
- ("pdbx_entity_branch", "entity_id"),
37
- ("pdbx_entity_branch_descriptor", "entity_id"),
38
- ("pdbx_entity_branch_list", "entity_id"),
39
- ("pdbx_entity_func_bind_mode", "entity_id"),
40
- ("pdbx_entity_name", "entity_id"),
41
- ("pdbx_entity_nonpoly", "entity_id"),
42
- ("pdbx_entity_poly_domain", "entity_id"),
43
- ("pdbx_entity_poly_na_nonstandard", "entity_id"),
44
- ("pdbx_entity_poly_na_type", "entity_id"),
45
- ("pdbx_entity_poly_protein_class", "entity_id"),
46
- ("pdbx_entity_prod_protocol", "entity_id"),
47
- ("pdbx_entity_remapping", "entity_id"),
48
- ("pdbx_entity_src_gen_character", "entity_id"),
49
- ("pdbx_entity_src_gen_chrom", "entity_id"),
50
- ("pdbx_entity_src_gen_clone", "entity_id"),
51
- ("pdbx_entity_src_gen_express", "entity_id"),
52
- ("pdbx_entity_src_gen_fract", "entity_id"),
53
- ("pdbx_entity_src_gen_lysis", "entity_id"),
54
- ("pdbx_entity_src_gen_prod_digest", "entity_id"),
55
- ("pdbx_entity_src_gen_prod_other", "entity_id"),
56
- ("pdbx_entity_src_gen_prod_pcr", "entity_id"),
57
- ("pdbx_entity_src_gen_proteolysis", "entity_id"),
58
- ("pdbx_entity_src_gen_pure", "entity_id"),
59
- ("pdbx_entity_src_gen_refold", "entity_id"),
60
- ("pdbx_entity_src_syn", "entity_id"),
61
- ("pdbx_linked_entity_list", "entity_id"),
62
- ("pdbx_prerelease_seq", "entity_id"),
63
- ("pdbx_sifts_xref_db", "entity_id"),
64
- ("pdbx_sifts_xref_db_segments", "entity_id"),
65
- ("pdbx_struct_entity_inst", "entity_id"),
66
- ("struct_asym", "entity_id"),
67
- ("struct_ref", "entity_id"),
68
- ]
69
-
70
- CATEGORIES_WITH_ASYM_ID = [
71
- ("pdbx_coordinate_model", "asym_id"),
72
- ("pdbx_distant_solvent_atoms", "label_asym_id"),
73
- ("pdbx_linked_entity_instance_list", "asym_id"),
74
- ("pdbx_poly_seq_scheme", "asym_id"),
75
- ("pdbx_sifts_unp_segments", "asym_id"),
76
- ("pdbx_struct_asym_gen", "asym_id"),
77
- ("pdbx_struct_ncs_virus_gen", "asym_id"),
78
- ("pdbx_struct_special_symmetry", "label_asym_id"),
79
- ("pdbx_unobs_or_zero_occ_atoms", "label_asym_id"),
80
- ("pdbx_unobs_or_zero_occ_residues", "label_asym_id"),
81
- ("refine_ls_restr_ncs", "pdbx_asym_id"),
82
- ("struct_biol_gen", "asym_id"),
83
- ]
84
-
85
- CATEGORIES_WITH_AUTH_ASYM_ID = [
86
- ("atom_site_anisotrop", "pdbx_auth_asym_id"),
87
- ("pdbx_atom_site_aniso_tls", "auth_asym_id"),
88
- ("pdbx_entity_instance_feature", "auth_asym_id"),
89
- ("pdbx_feature_monomer", "auth_asym_id"),
90
- ("pdbx_missing_atom_nonpoly", "auth_asym_id"),
91
- ("pdbx_missing_atom_poly", "auth_asym_id"),
92
- ("pdbx_modification_feature", "auth_asym_id"),
93
- ("pdbx_refine_component", "auth_asym_id"),
94
- ("pdbx_remediation_atom_site_mapping", "auth_asym_id"),
95
- ("pdbx_rmch_outlier", "auth_asym_id"),
96
- ("pdbx_rms_devs_cov_by_monomer", "auth_asym_id"),
97
- ("pdbx_sequence_pattern", "auth_asym_id"),
98
- ("pdbx_solvent_atom_site_mapping", "auth_asym_id"),
99
- ("pdbx_stereochemistry", "auth_asym_id"),
100
- ("pdbx_struct_chem_comp_diagnostics", "pdb_strand_id"),
101
- ("pdbx_struct_chem_comp_feature", "pdb_strand_id"),
102
- ("pdbx_struct_group_components", "auth_asym_id"),
103
- ("pdbx_struct_mod_residue", "auth_asym_id"),
104
- ("pdbx_sugar_phosphate_geometry", "auth_asym_id"),
105
- ("pdbx_validate_chiral", "auth_asym_id"),
106
- ("pdbx_validate_main_chain_plane", "auth_asym_id"),
107
- ("pdbx_validate_planes", "auth_asym_id"),
108
- ("pdbx_validate_planes_atom", "auth_asym_id"),
109
- ("pdbx_validate_torsion", "auth_asym_id"),
110
- ("struct_mon_nucl", "auth_asym_id"),
111
- ("struct_mon_prot", "auth_asym_id"),
112
- ("struct_site_gen", "auth_asym_id"),
113
- ]
114
-
115
-
116
- def select_ids(
117
- data: List[DataContainer],
118
- obj_name: str,
119
- tested_field_name: str,
120
- extracted_field_name: str,
121
- accepted_values: Set[str],
122
- ) -> Set[str]:
123
- obj = data[0].getObj(obj_name)
124
- ids = set()
125
-
126
- if obj:
127
- for row in obj.getRowList():
128
- row_dict = dict(zip(obj.getAttributeList(), row))
129
-
130
- if row_dict.get(tested_field_name, None) in accepted_values:
131
- ids.add(row_dict[extracted_field_name])
132
-
133
- return ids
134
-
135
-
136
- def select_category_by_id(
137
- data: List[DataContainer],
138
- category: str,
139
- field_name: str,
140
- ids: List[str],
141
- ) -> Tuple[List[str], List[List[str]]]:
142
- obj = data[0].getObj(category)
143
- attributes = []
144
- rows = []
145
-
146
- if obj:
147
- attributes = obj.getAttributeList()
148
-
149
- for row in obj.getRowList():
150
- row_dict = dict(zip(obj.getAttributeList(), row))
151
-
152
- if row_dict.get(field_name, None) in ids:
153
- rows.append(row)
154
-
155
- return attributes, rows
156
-
157
-
158
- def filter_by_poly_types(
159
- file_content: str,
160
- entity_poly_types: Iterable[str] = ["polyribonucleotide"],
161
- retain_categories: Iterable[str] = [],
162
- ) -> str:
163
- adapter = IoAdapterPy()
164
-
165
- with tempfile.NamedTemporaryFile("rt+") as f:
166
- f.write(file_content)
167
- f.seek(0)
168
- data = adapter.readFile(f.name)
169
-
170
- entity_ids = select_ids(
171
- data, "entity_poly", "type", "entity_id", set(entity_poly_types)
172
- )
173
- asym_ids = select_ids(data, "struct_asym", "entity_id", "id", entity_ids)
174
- auth_asym_ids = select_ids(
175
- data, "atom_site", "label_asym_id", "auth_asym_id", asym_ids
176
- )
177
-
178
- output = DataContainer("rnapolis")
179
-
180
- for table, ids in (
181
- (CATEGORIES_WITH_ENTITY_ID, entity_ids),
182
- (CATEGORIES_WITH_ASYM_ID, asym_ids),
183
- (CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
184
- ):
185
- for category, field_name in table:
186
- attributes, rows = select_category_by_id(data, category, field_name, ids)
187
-
188
- if attributes and rows:
189
- obj = DataCategory(category, attributes, rows)
190
- output.append(obj)
191
-
192
- for category in retain_categories:
193
- obj = data[0].getObj(category)
194
- if obj:
195
- output.append(obj)
196
-
197
- with tempfile.NamedTemporaryFile("rt+") as tmp:
198
- adapter.writeFile(tmp.name, [output])
199
- tmp.seek(0)
200
- return tmp.read()
201
-
202
-
203
- def filter_by_chains(
204
- file_content: str, chains: Iterable[str], retain_categories: Iterable[str] = []
205
- ) -> str:
206
- """
207
- Filter a PDBx/mmCIF file by chain IDs. The function returns a new PDBx/mmCIF file.
208
-
209
- Warning! The new file might contain more chains than provided in the `chains` argument.
210
- This is because the function filters by entity, so if you ask for chain "A",
211
- which is part of entity 1 having chains "A", "B" and "C", then you will get all three chains.
212
- """
213
- adapter = IoAdapterPy()
214
-
215
- with tempfile.NamedTemporaryFile("rt+") as f:
216
- f.write(file_content)
217
- f.seek(0)
218
- data = adapter.readFile(f.name)
219
-
220
- output = DataContainer("rnapolis")
221
-
222
- entity_ids = select_ids(data, "struct_asym", "id", "entity_id", set(chains))
223
- asym_ids = set(chains)
224
- auth_asym_ids = select_ids(
225
- data, "atom_site", "label_asym_id", "auth_asym_id", asym_ids
226
- )
227
-
228
- for table, ids in (
229
- (CATEGORIES_WITH_ENTITY_ID, entity_ids),
230
- (CATEGORIES_WITH_ASYM_ID, asym_ids),
231
- (CATEGORIES_WITH_AUTH_ASYM_ID, auth_asym_ids),
232
- ):
233
- for category, field_name in table:
234
- attributes, rows = select_category_by_id(data, category, field_name, ids)
235
-
236
- if attributes and rows:
237
- obj = DataCategory(category, attributes, rows)
238
- output.append(obj)
239
-
240
- for category in retain_categories:
241
- obj = data[0].getObj(category)
242
- if obj:
243
- output.append(obj)
244
-
245
- with tempfile.NamedTemporaryFile("rt+") as tmp:
246
- adapter.writeFile(tmp.name, [output])
247
- tmp.seek(0)
248
- return tmp.read()
249
-
250
-
251
- def main():
252
- parser = argparse.ArgumentParser()
253
- parser.add_argument(
254
- "--type",
255
- help="a type of molecule to select, you can provide this argument multiple times (default: polyribonucleotide)",
256
- action="append",
257
- default=["polyribonucleotide"],
258
- choices=ENTITY_POLY_TYPES,
259
- )
260
- parser.add_argument(
261
- "--chain",
262
- help="a chain ID (label_asym_id) to select, you can provide this argument multiple times (if provided, it overrides the --type argument)",
263
- action="append",
264
- default=[],
265
- )
266
- parser.add_argument("path", help="path to a PDBx/mmCIF file")
267
- args = parser.parse_args()
268
-
269
- file = handle_input_file(args.path)
270
-
271
- if args.chain:
272
- print(filter_by_chains(file.read(), args.chain))
273
- elif args.type:
274
- print(filter_by_poly_types(file.read(), args.type))
275
- else:
276
- parser.print_help()
277
-
278
-
279
- if __name__ == "__main__":
280
- main()
File without changes
File without changes
File without changes
File without changes