biotite 1.0.1__cp310-cp310-win_amd64.whl → 1.2.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/application.py +3 -3
- biotite/application/autodock/app.py +1 -1
- biotite/application/blast/webapp.py +1 -1
- biotite/application/clustalo/app.py +1 -1
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +36 -2
- biotite/application/msaapp.py +10 -10
- biotite/application/muscle/app3.py +5 -18
- biotite/application/muscle/app5.py +5 -5
- biotite/application/sra/app.py +0 -5
- biotite/application/util.py +22 -2
- biotite/application/viennarna/rnaalifold.py +8 -8
- biotite/application/viennarna/rnaplot.py +9 -3
- biotite/application/viennarna/util.py +1 -1
- biotite/application/webapp.py +1 -1
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +191 -0
- biotite/database/entrez/dbnames.py +10 -0
- biotite/database/entrez/download.py +9 -10
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +5 -4
- biotite/database/pubchem/download.py +6 -6
- biotite/database/pubchem/error.py +10 -0
- biotite/database/pubchem/query.py +12 -23
- biotite/database/rcsb/download.py +3 -2
- biotite/database/rcsb/query.py +8 -9
- biotite/database/uniprot/check.py +22 -17
- biotite/database/uniprot/download.py +3 -6
- biotite/database/uniprot/query.py +4 -5
- biotite/file.py +14 -2
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +16 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +198 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1226 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +15 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +71 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/align/__init__.py +0 -4
- biotite/sequence/align/alignment.py +49 -14
- biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +26 -26
- biotite/sequence/align/cigar.py +2 -2
- biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +19 -2
- biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +58 -48
- biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +47 -47
- biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +10 -10
- biotite/sequence/align/matrix.py +284 -57
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +35 -35
- biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +2 -2
- biotite/sequence/align/statistics.py +1 -1
- biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +5 -2
- biotite/sequence/annotation.py +19 -13
- biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +1 -2
- biotite/sequence/graphics/alignment.py +25 -39
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/graphics/dendrogram.py +4 -2
- biotite/sequence/graphics/features.py +2 -2
- biotite/sequence/graphics/logo.py +10 -12
- biotite/sequence/io/fasta/convert.py +1 -2
- biotite/sequence/io/fasta/file.py +1 -1
- biotite/sequence/io/fastq/file.py +3 -3
- biotite/sequence/io/genbank/file.py +3 -3
- biotite/sequence/io/genbank/sequence.py +2 -0
- biotite/sequence/io/gff/convert.py +1 -1
- biotite/sequence/io/gff/file.py +1 -2
- biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +105 -29
- biotite/sequence/search.py +0 -1
- biotite/sequence/seqtypes.py +136 -8
- biotite/sequence/sequence.py +1 -2
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +6 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +163 -66
- biotite/structure/basepairs.py +26 -26
- biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +79 -25
- biotite/structure/box.py +19 -21
- biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +83 -67
- biotite/structure/chains.py +5 -37
- biotite/structure/charges.cp310-win_amd64.pyd +0 -0
- biotite/structure/compare.py +420 -13
- biotite/structure/density.py +1 -1
- biotite/structure/dotbracket.py +27 -28
- biotite/structure/filter.py +8 -8
- biotite/structure/geometry.py +74 -127
- biotite/structure/hbond.py +17 -19
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +24 -15
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -34
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +62 -19
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -22
- biotite/structure/info/radii.py +92 -22
- biotite/structure/info/standardize.py +4 -4
- biotite/structure/integrity.py +4 -6
- biotite/structure/io/general.py +2 -2
- biotite/structure/io/gro/file.py +8 -9
- biotite/structure/io/mol/convert.py +1 -1
- biotite/structure/io/mol/ctab.py +33 -28
- biotite/structure/io/mol/mol.py +1 -1
- biotite/structure/io/mol/sdf.py +80 -53
- biotite/structure/io/pdb/convert.py +4 -3
- biotite/structure/io/pdb/file.py +85 -25
- biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +36 -36
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +54 -15
- biotite/structure/io/pdbx/cif.py +92 -66
- biotite/structure/io/pdbx/component.py +15 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +410 -75
- biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +9 -6
- biotite/structure/io/util.py +38 -0
- biotite/structure/mechanics.py +0 -1
- biotite/structure/molecules.py +141 -156
- biotite/structure/pseudoknots.py +7 -13
- biotite/structure/repair.py +2 -4
- biotite/structure/residues.py +13 -24
- biotite/structure/rings.py +335 -0
- biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +2 -1
- biotite/structure/segments.py +69 -11
- biotite/structure/sequence.py +0 -1
- biotite/structure/sse.py +0 -2
- biotite/structure/superimpose.py +74 -62
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +12 -25
- biotite/structure/util.py +76 -4
- biotite/version.py +9 -4
- biotite/visualize.py +111 -1
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# information.
|
|
4
4
|
|
|
5
5
|
__name__ = "biotite.structure.io.pdbx"
|
|
6
|
-
__author__ = "Fabrice Allain, Patrick Kunzmann"
|
|
6
|
+
__author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
|
|
7
7
|
__all__ = [
|
|
8
8
|
"get_sequence",
|
|
9
9
|
"get_model_count",
|
|
@@ -13,6 +13,7 @@ __all__ = [
|
|
|
13
13
|
"set_component",
|
|
14
14
|
"list_assemblies",
|
|
15
15
|
"get_assembly",
|
|
16
|
+
"get_sse",
|
|
16
17
|
]
|
|
17
18
|
|
|
18
19
|
import itertools
|
|
@@ -24,6 +25,10 @@ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
|
|
|
24
25
|
from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
|
|
25
26
|
from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
|
|
26
27
|
from biotite.structure.error import BadStructureError
|
|
28
|
+
from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
|
|
29
|
+
from biotite.structure.filter import (
|
|
30
|
+
_canonical_nucleotide_list as canonical_nucleotide_list,
|
|
31
|
+
)
|
|
27
32
|
from biotite.structure.filter import (
|
|
28
33
|
filter_first_altloc,
|
|
29
34
|
filter_highest_occupancy_altloc,
|
|
@@ -36,32 +41,38 @@ from biotite.structure.io.pdbx.bcif import (
|
|
|
36
41
|
from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
|
|
37
42
|
from biotite.structure.io.pdbx.component import MaskValue
|
|
38
43
|
from biotite.structure.io.pdbx.encoding import StringArrayEncoding
|
|
39
|
-
from biotite.structure.residues import
|
|
44
|
+
from biotite.structure.residues import (
|
|
45
|
+
get_residue_count,
|
|
46
|
+
get_residue_positions,
|
|
47
|
+
get_residue_starts_for,
|
|
48
|
+
)
|
|
40
49
|
from biotite.structure.util import matrix_rotate
|
|
41
50
|
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
51
|
+
# Bond types in `struct_conn` category that refer to covalent bonds
|
|
52
|
+
PDBX_BOND_TYPE_ID_TO_TYPE = {
|
|
53
|
+
# Although a covalent bond, could in theory have a higher bond order,
|
|
54
|
+
# practically inter-residue bonds are always single
|
|
55
|
+
"covale": BondType.SINGLE,
|
|
56
|
+
"covale_base": BondType.SINGLE,
|
|
57
|
+
"covale_phosphate": BondType.SINGLE,
|
|
58
|
+
"covale_sugar": BondType.SINGLE,
|
|
59
|
+
"disulf": BondType.SINGLE,
|
|
60
|
+
"modres": BondType.SINGLE,
|
|
61
|
+
"modres_link": BondType.SINGLE,
|
|
62
|
+
"metalc": BondType.COORDINATION,
|
|
63
|
+
}
|
|
64
|
+
PDBX_BOND_TYPE_TO_TYPE_ID = {
|
|
65
|
+
BondType.ANY: "covale",
|
|
66
|
+
BondType.SINGLE: "covale",
|
|
67
|
+
BondType.DOUBLE: "covale",
|
|
68
|
+
BondType.TRIPLE: "covale",
|
|
69
|
+
BondType.QUADRUPLE: "covale",
|
|
70
|
+
BondType.AROMATIC_SINGLE: "covale",
|
|
71
|
+
BondType.AROMATIC_DOUBLE: "covale",
|
|
72
|
+
BondType.AROMATIC_TRIPLE: "covale",
|
|
73
|
+
BondType.COORDINATION: "metalc",
|
|
60
74
|
}
|
|
61
|
-
# ...and vice versa
|
|
62
75
|
PDBX_BOND_TYPE_TO_ORDER = {
|
|
63
|
-
# 'ANY' is masked later, it is merely added here to avoid a KeyError
|
|
64
|
-
BondType.ANY: "",
|
|
65
76
|
BondType.SINGLE: "sing",
|
|
66
77
|
BondType.DOUBLE: "doub",
|
|
67
78
|
BondType.TRIPLE: "trip",
|
|
@@ -69,6 +80,10 @@ PDBX_BOND_TYPE_TO_ORDER = {
|
|
|
69
80
|
BondType.AROMATIC_SINGLE: "sing",
|
|
70
81
|
BondType.AROMATIC_DOUBLE: "doub",
|
|
71
82
|
BondType.AROMATIC_TRIPLE: "trip",
|
|
83
|
+
# These are masked later, it is merely added here to avoid a KeyError
|
|
84
|
+
BondType.ANY: "",
|
|
85
|
+
BondType.AROMATIC: "",
|
|
86
|
+
BondType.COORDINATION: "",
|
|
72
87
|
}
|
|
73
88
|
# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
|
|
74
89
|
COMP_BOND_ORDER_TO_TYPE = {
|
|
@@ -79,11 +94,19 @@ COMP_BOND_ORDER_TO_TYPE = {
|
|
|
79
94
|
("SING", "Y"): BondType.AROMATIC_SINGLE,
|
|
80
95
|
("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
|
|
81
96
|
("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
|
|
97
|
+
("AROM", "Y"): BondType.AROMATIC,
|
|
82
98
|
}
|
|
83
99
|
# ...and vice versa
|
|
84
100
|
COMP_BOND_TYPE_TO_ORDER = {
|
|
85
101
|
bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
|
|
86
102
|
}
|
|
103
|
+
CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
|
|
104
|
+
# it was observed that when the number or rows in `atom_site` and `struct_conn`
|
|
105
|
+
# exceed a certain threshold,
|
|
106
|
+
# a dictionary approach is less computation and memory intensive than the dense
|
|
107
|
+
# vectorized approach.
|
|
108
|
+
# https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
|
|
109
|
+
FIND_MATCHES_SWITCH_THRESHOLD = 4000000
|
|
87
110
|
|
|
88
111
|
_proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
|
|
89
112
|
_nucleotideseq_type_list = [
|
|
@@ -146,8 +169,8 @@ def get_sequence(pdbx_file, data_block=None):
|
|
|
146
169
|
-------
|
|
147
170
|
sequence_dict : Dictionary of Sequences
|
|
148
171
|
Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
|
|
149
|
-
(
|
|
150
|
-
|
|
172
|
+
(equivalent to ``atom_site.auth_asym_id``).
|
|
173
|
+
Dictionary values are sequences.
|
|
151
174
|
|
|
152
175
|
Notes
|
|
153
176
|
-----
|
|
@@ -203,9 +226,7 @@ def get_model_count(pdbx_file, data_block=None):
|
|
|
203
226
|
The number of models.
|
|
204
227
|
"""
|
|
205
228
|
block = _get_block(pdbx_file, data_block)
|
|
206
|
-
return len(
|
|
207
|
-
_get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))
|
|
208
|
-
)
|
|
229
|
+
return len(np.unique((block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))))
|
|
209
230
|
|
|
210
231
|
|
|
211
232
|
def get_structure(
|
|
@@ -296,7 +317,6 @@ def get_structure(
|
|
|
296
317
|
>>> arr = get_structure(file, model=1)
|
|
297
318
|
>>> print(len(arr))
|
|
298
319
|
304
|
|
299
|
-
|
|
300
320
|
"""
|
|
301
321
|
block = _get_block(pdbx_file, data_block)
|
|
302
322
|
|
|
@@ -307,13 +327,12 @@ def get_structure(
|
|
|
307
327
|
raise InvalidFileError("Missing 'atom_site' category in file")
|
|
308
328
|
|
|
309
329
|
models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
|
|
310
|
-
|
|
311
|
-
model_count = len(model_starts)
|
|
330
|
+
model_count = len(np.unique(models))
|
|
312
331
|
atom_count = len(models)
|
|
313
332
|
|
|
314
333
|
if model is None:
|
|
315
334
|
# For a stack, the annotations are derived from the first model
|
|
316
|
-
model_atom_site = _filter_model(atom_site,
|
|
335
|
+
model_atom_site = _filter_model(atom_site, 1)
|
|
317
336
|
# Any field of the category would work here to get the length
|
|
318
337
|
model_length = model_atom_site.row_count
|
|
319
338
|
atoms = AtomArrayStack(model_count, model_length)
|
|
@@ -359,7 +378,7 @@ def get_structure(
|
|
|
359
378
|
f"the given model {model} does not exist"
|
|
360
379
|
)
|
|
361
380
|
|
|
362
|
-
model_atom_site = _filter_model(atom_site,
|
|
381
|
+
model_atom_site = _filter_model(atom_site, model)
|
|
363
382
|
# Any field of the category would work here to get the length
|
|
364
383
|
model_length = model_atom_site.row_count
|
|
365
384
|
atoms = AtomArray(model_length)
|
|
@@ -475,16 +494,53 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
|
|
|
475
494
|
array.set_annotation("element", atom_site["type_symbol"].as_array(str))
|
|
476
495
|
|
|
477
496
|
if "atom_id" in extra_fields:
|
|
478
|
-
|
|
497
|
+
if "id" in atom_site:
|
|
498
|
+
array.set_annotation("atom_id", atom_site["id"].as_array(int))
|
|
499
|
+
else:
|
|
500
|
+
warnings.warn(
|
|
501
|
+
"Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
|
|
502
|
+
UserWarning,
|
|
503
|
+
)
|
|
504
|
+
array.set_annotation("atom_id", np.arange(array.array_length()))
|
|
479
505
|
extra_fields.remove("atom_id")
|
|
480
506
|
if "b_factor" in extra_fields:
|
|
481
|
-
|
|
507
|
+
if "B_iso_or_equiv" in atom_site:
|
|
508
|
+
array.set_annotation(
|
|
509
|
+
"b_factor", atom_site["B_iso_or_equiv"].as_array(float)
|
|
510
|
+
)
|
|
511
|
+
else:
|
|
512
|
+
warnings.warn(
|
|
513
|
+
"Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
|
|
514
|
+
UserWarning,
|
|
515
|
+
)
|
|
516
|
+
array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
|
|
482
517
|
extra_fields.remove("b_factor")
|
|
483
518
|
if "occupancy" in extra_fields:
|
|
484
|
-
|
|
519
|
+
if "occupancy" in atom_site:
|
|
520
|
+
array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
|
|
521
|
+
else:
|
|
522
|
+
warnings.warn(
|
|
523
|
+
"Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
|
|
524
|
+
UserWarning,
|
|
525
|
+
)
|
|
526
|
+
array.set_annotation(
|
|
527
|
+
"occupancy", np.ones(array.array_length(), dtype=float)
|
|
528
|
+
)
|
|
485
529
|
extra_fields.remove("occupancy")
|
|
486
530
|
if "charge" in extra_fields:
|
|
487
|
-
|
|
531
|
+
if "pdbx_formal_charge" in atom_site:
|
|
532
|
+
array.set_annotation(
|
|
533
|
+
"charge",
|
|
534
|
+
atom_site["pdbx_formal_charge"].as_array(
|
|
535
|
+
int, 0
|
|
536
|
+
), # masked values are set to 0
|
|
537
|
+
)
|
|
538
|
+
else:
|
|
539
|
+
warnings.warn(
|
|
540
|
+
"Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
|
|
541
|
+
UserWarning,
|
|
542
|
+
)
|
|
543
|
+
array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))
|
|
488
544
|
extra_fields.remove("charge")
|
|
489
545
|
|
|
490
546
|
# Handle all remaining custom fields
|
|
@@ -536,7 +592,8 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
|
|
|
536
592
|
]
|
|
537
593
|
|
|
538
594
|
covale_mask = np.isin(
|
|
539
|
-
struct_conn["conn_type_id"].as_array(str),
|
|
595
|
+
struct_conn["conn_type_id"].as_array(str),
|
|
596
|
+
list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
|
|
540
597
|
)
|
|
541
598
|
if "ptnr1_symmetry" in struct_conn:
|
|
542
599
|
covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
|
|
@@ -576,13 +633,14 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
|
|
|
576
633
|
atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
|
|
577
634
|
atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
|
|
578
635
|
|
|
579
|
-
|
|
580
|
-
bond_order = struct_conn["pdbx_value_order"].as_array(str, "")
|
|
636
|
+
bond_type_id = struct_conn["conn_type_id"].as_array()
|
|
581
637
|
# Consecutively apply the same masks as applied to the atom indices
|
|
582
638
|
# Logical combination does not work here,
|
|
583
639
|
# as the second mask was created based on already filtered data
|
|
584
|
-
|
|
585
|
-
|
|
640
|
+
bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
|
|
641
|
+
# The type ID is always present in the dictionary,
|
|
642
|
+
# as it was used to filter the applicable bonds
|
|
643
|
+
bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
|
|
586
644
|
|
|
587
645
|
return BondList(
|
|
588
646
|
atom_site.row_count,
|
|
@@ -593,9 +651,20 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
|
|
|
593
651
|
def _find_matches(query_arrays, reference_arrays):
|
|
594
652
|
"""
|
|
595
653
|
For each index in the `query_arrays` find the indices in the
|
|
596
|
-
`reference_arrays` where all query values the reference counterpart.
|
|
654
|
+
`reference_arrays` where all query values match the reference counterpart.
|
|
597
655
|
If no match is found for a query, the corresponding index is -1.
|
|
598
656
|
"""
|
|
657
|
+
if (
|
|
658
|
+
query_arrays[0].shape[0] * reference_arrays[0].shape[0]
|
|
659
|
+
<= FIND_MATCHES_SWITCH_THRESHOLD
|
|
660
|
+
):
|
|
661
|
+
match_indices = _find_matches_by_dense_array(query_arrays, reference_arrays)
|
|
662
|
+
else:
|
|
663
|
+
match_indices = _find_matches_by_dict(query_arrays, reference_arrays)
|
|
664
|
+
return match_indices
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def _find_matches_by_dense_array(query_arrays, reference_arrays):
|
|
599
668
|
match_masks_for_all_columns = np.stack(
|
|
600
669
|
[
|
|
601
670
|
query[:, np.newaxis] == reference[np.newaxis, :]
|
|
@@ -623,6 +692,38 @@ def _find_matches(query_arrays, reference_arrays):
|
|
|
623
692
|
return match_indices
|
|
624
693
|
|
|
625
694
|
|
|
695
|
+
def _find_matches_by_dict(query_arrays, reference_arrays):
|
|
696
|
+
# Convert reference arrays to a dictionary for O(1) lookups
|
|
697
|
+
reference_dict = {}
|
|
698
|
+
ambiguous_keys = set()
|
|
699
|
+
for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
|
|
700
|
+
ref_key = tuple(ref_row)
|
|
701
|
+
if ref_key in reference_dict:
|
|
702
|
+
ambiguous_keys.add(ref_key)
|
|
703
|
+
continue
|
|
704
|
+
reference_dict[ref_key] = ref_idx
|
|
705
|
+
|
|
706
|
+
match_indices = []
|
|
707
|
+
for query_idx, query_row in enumerate(zip(*query_arrays)):
|
|
708
|
+
query_key = tuple(query_row)
|
|
709
|
+
occurrence = reference_dict.get(query_key)
|
|
710
|
+
|
|
711
|
+
if occurrence is None:
|
|
712
|
+
# -1 indicates that no match was found in the reference
|
|
713
|
+
match_indices.append(-1)
|
|
714
|
+
elif query_key in ambiguous_keys:
|
|
715
|
+
# The query cannot be uniquely matched to an atom in the reference
|
|
716
|
+
raise InvalidFileError(
|
|
717
|
+
f"The covalent bond in the 'struct_conn' category at index "
|
|
718
|
+
f"{query_idx} cannot be unambiguously assigned to atoms in "
|
|
719
|
+
f"the 'atom_site' category"
|
|
720
|
+
)
|
|
721
|
+
else:
|
|
722
|
+
match_indices.append(occurrence)
|
|
723
|
+
|
|
724
|
+
return np.array(match_indices)
|
|
725
|
+
|
|
726
|
+
|
|
626
727
|
def _get_struct_conn_col_name(col_name, partner):
|
|
627
728
|
"""
|
|
628
729
|
For a column name in ``atom_site`` get the corresponding column name
|
|
@@ -661,21 +762,26 @@ def _filter_altloc(array, atom_site, altloc):
|
|
|
661
762
|
raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
|
|
662
763
|
|
|
663
764
|
|
|
664
|
-
def
|
|
665
|
-
"""
|
|
666
|
-
Get the start index for each model in the arrays of the
|
|
667
|
-
``atom_site`` category.
|
|
668
|
-
"""
|
|
669
|
-
_, indices = np.unique(model_array, return_index=True)
|
|
670
|
-
indices.sort()
|
|
671
|
-
return indices
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
def _filter_model(atom_site, model_starts, model):
|
|
765
|
+
def _filter_model(atom_site, model):
|
|
675
766
|
"""
|
|
676
767
|
Reduce the ``atom_site`` category to the values for the given
|
|
677
768
|
model.
|
|
769
|
+
|
|
770
|
+
Parameters
|
|
771
|
+
----------
|
|
772
|
+
atom_site : CIFCategory or BinaryCIFCategory
|
|
773
|
+
``atom_site`` category containing all models.
|
|
774
|
+
model : int
|
|
775
|
+
The model to be selected.
|
|
776
|
+
|
|
777
|
+
Returns
|
|
778
|
+
-------
|
|
779
|
+
atom_site : CIFCategory or BinaryCIFCategory
|
|
780
|
+
The ``atom_site`` category containing only the selected model.
|
|
678
781
|
"""
|
|
782
|
+
models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
|
|
783
|
+
_, model_starts = np.unique(models, return_index=True)
|
|
784
|
+
model_starts.sort()
|
|
679
785
|
# Append exclusive stop
|
|
680
786
|
model_starts = np.append(model_starts, [atom_site.row_count])
|
|
681
787
|
# Indexing starts at 0, but model number starts at 1
|
|
@@ -703,7 +809,13 @@ def _get_box(block):
|
|
|
703
809
|
return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
|
|
704
810
|
|
|
705
811
|
|
|
706
|
-
def set_structure(
|
|
812
|
+
def set_structure(
|
|
813
|
+
pdbx_file,
|
|
814
|
+
array,
|
|
815
|
+
data_block=None,
|
|
816
|
+
include_bonds=False,
|
|
817
|
+
extra_fields=[],
|
|
818
|
+
):
|
|
707
819
|
"""
|
|
708
820
|
Set the ``atom_site`` category with atom information from an
|
|
709
821
|
:class:`AtomArray` or :class:`AtomArrayStack`.
|
|
@@ -737,6 +849,10 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
|
|
|
737
849
|
category.
|
|
738
850
|
Inter-residue bonds will be written into the ``struct_conn``
|
|
739
851
|
independent of this parameter.
|
|
852
|
+
extra_fields : list of str, optional
|
|
853
|
+
List of additional fields from the ``atom_site`` category
|
|
854
|
+
that should be written into the file.
|
|
855
|
+
Default is an empty list.
|
|
740
856
|
|
|
741
857
|
Notes
|
|
742
858
|
-----
|
|
@@ -752,7 +868,6 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
|
|
|
752
868
|
>>> file = CIFFile()
|
|
753
869
|
>>> set_structure(file, atom_array)
|
|
754
870
|
>>> file.write(os.path.join(path_to_directory, "structure.cif"))
|
|
755
|
-
|
|
756
871
|
"""
|
|
757
872
|
_check_non_empty(array)
|
|
758
873
|
|
|
@@ -773,7 +888,11 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
|
|
|
773
888
|
)
|
|
774
889
|
atom_site["label_comp_id"] = np.copy(array.res_name)
|
|
775
890
|
atom_site["label_asym_id"] = np.copy(array.chain_id)
|
|
776
|
-
atom_site["label_entity_id"] =
|
|
891
|
+
atom_site["label_entity_id"] = (
|
|
892
|
+
np.copy(array.label_entity_id)
|
|
893
|
+
if "label_entity_id" in array.get_annotation_categories()
|
|
894
|
+
else _determine_entity_id(array.chain_id)
|
|
895
|
+
)
|
|
777
896
|
atom_site["label_seq_id"] = np.copy(array.res_id)
|
|
778
897
|
atom_site["pdbx_PDB_ins_code"] = Column(
|
|
779
898
|
np.copy(array.ins_code),
|
|
@@ -797,6 +916,32 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
|
|
|
797
916
|
np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
|
|
798
917
|
)
|
|
799
918
|
|
|
919
|
+
# Handle all remaining custom fields
|
|
920
|
+
if len(extra_fields) > 0:
|
|
921
|
+
# ... check to avoid clashes with standard annotations
|
|
922
|
+
_standard_annotations = [
|
|
923
|
+
"hetero",
|
|
924
|
+
"element",
|
|
925
|
+
"atom_name",
|
|
926
|
+
"res_name",
|
|
927
|
+
"chain_id",
|
|
928
|
+
"res_id",
|
|
929
|
+
"ins_code",
|
|
930
|
+
"atom_id",
|
|
931
|
+
"b_factor",
|
|
932
|
+
"occupancy",
|
|
933
|
+
"charge",
|
|
934
|
+
]
|
|
935
|
+
_reserved_annotation_names = list(atom_site.keys()) + _standard_annotations
|
|
936
|
+
|
|
937
|
+
for annot in extra_fields:
|
|
938
|
+
if annot in _reserved_annotation_names:
|
|
939
|
+
raise ValueError(
|
|
940
|
+
f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
|
|
941
|
+
"Please choose another name."
|
|
942
|
+
)
|
|
943
|
+
atom_site[annot] = np.copy(array.get_annotation(annot))
|
|
944
|
+
|
|
800
945
|
if array.bonds is not None:
|
|
801
946
|
struct_conn = _set_inter_residue_bonds(array, atom_site)
|
|
802
947
|
if struct_conn is not None:
|
|
@@ -1021,13 +1166,21 @@ def _set_inter_residue_bonds(array, atom_site):
|
|
|
1021
1166
|
if len(bond_array) == 0:
|
|
1022
1167
|
return None
|
|
1023
1168
|
|
|
1169
|
+
# Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
|
|
1170
|
+
# nucleotide/amino acid residues
|
|
1171
|
+
bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
|
|
1172
|
+
if len(bond_array) == 0:
|
|
1173
|
+
return None
|
|
1174
|
+
|
|
1024
1175
|
struct_conn = Category()
|
|
1025
1176
|
struct_conn["id"] = np.arange(1, len(bond_array) + 1)
|
|
1026
|
-
struct_conn["conn_type_id"] =
|
|
1177
|
+
struct_conn["conn_type_id"] = [
|
|
1178
|
+
PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_array[:, 2]
|
|
1179
|
+
]
|
|
1027
1180
|
struct_conn["pdbx_value_order"] = Column(
|
|
1028
1181
|
np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
|
|
1029
1182
|
np.where(
|
|
1030
|
-
bond_array[:, 2]
|
|
1183
|
+
np.isin(bond_array[:, 2], (BondType.ANY, BondType.COORDINATION)),
|
|
1031
1184
|
MaskValue.MISSING,
|
|
1032
1185
|
MaskValue.PRESENT,
|
|
1033
1186
|
),
|
|
@@ -1063,7 +1216,34 @@ def _filter_bonds(array, connection):
|
|
|
1063
1216
|
raise ValueError("Invalid 'connection' option")
|
|
1064
1217
|
|
|
1065
1218
|
|
|
1066
|
-
def
|
|
1219
|
+
def _filter_canonical_links(array, bond_array):
|
|
1220
|
+
"""
|
|
1221
|
+
Filter out peptide bonds between adjacent canonical amino acid residues.
|
|
1222
|
+
"""
|
|
1223
|
+
# Get the residue index for each bonded atom
|
|
1224
|
+
residue_indices = get_residue_positions(array, bond_array[:, :2].flatten()).reshape(
|
|
1225
|
+
-1, 2
|
|
1226
|
+
)
|
|
1227
|
+
|
|
1228
|
+
return (
|
|
1229
|
+
# Must be canonical residues
|
|
1230
|
+
np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
|
|
1231
|
+
np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
|
|
1232
|
+
# Must be backbone bond
|
|
1233
|
+
np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
|
|
1234
|
+
np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
|
|
1235
|
+
# Must connect adjacent residues
|
|
1236
|
+
residue_indices[:, 1] - residue_indices[:, 0] == 1
|
|
1237
|
+
) # fmt: skip
|
|
1238
|
+
|
|
1239
|
+
|
|
1240
|
+
def get_component(
|
|
1241
|
+
pdbx_file,
|
|
1242
|
+
data_block=None,
|
|
1243
|
+
use_ideal_coord=True,
|
|
1244
|
+
res_name=None,
|
|
1245
|
+
allow_missing_coord=False,
|
|
1246
|
+
):
|
|
1067
1247
|
"""
|
|
1068
1248
|
Create an :class:`AtomArray` for a chemical component from the
|
|
1069
1249
|
``chem_comp_atom`` and, if available, the ``chem_comp_bond``
|
|
@@ -1091,6 +1271,11 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1091
1271
|
In this case, the component with the given residue name is
|
|
1092
1272
|
read.
|
|
1093
1273
|
By default, all rows would be read in this case.
|
|
1274
|
+
allow_missing_coord : bool, optional
|
|
1275
|
+
Whether to allow missing coordinate values in components.
|
|
1276
|
+
If ``True``, these will be represented as ``nan`` values.
|
|
1277
|
+
If ``False``, a ``ValueError`` is raised when missing coordinates
|
|
1278
|
+
are encountered.
|
|
1094
1279
|
|
|
1095
1280
|
Returns
|
|
1096
1281
|
-------
|
|
@@ -1161,17 +1346,29 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1161
1346
|
# Swap with the fallback option
|
|
1162
1347
|
coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
|
|
1163
1348
|
try:
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1349
|
+
array.coord = _parse_component_coordinates(
|
|
1350
|
+
[atom_category[field] for field in coord_fields]
|
|
1351
|
+
)
|
|
1352
|
+
except Exception as err:
|
|
1353
|
+
if isinstance(err, KeyError):
|
|
1354
|
+
key = err.args[0]
|
|
1355
|
+
warnings.warn(
|
|
1356
|
+
f"Attribute '{key}' not found within 'chem_comp_atom' category. "
|
|
1357
|
+
f"The fallback coordinates will be used instead",
|
|
1358
|
+
UserWarning,
|
|
1359
|
+
)
|
|
1360
|
+
elif isinstance(err, ValueError):
|
|
1361
|
+
warnings.warn(
|
|
1362
|
+
"The coordinates are missing for some atoms. "
|
|
1363
|
+
"The fallback coordinates will be used instead",
|
|
1364
|
+
UserWarning,
|
|
1365
|
+
)
|
|
1366
|
+
else:
|
|
1367
|
+
raise
|
|
1368
|
+
array.coord = _parse_component_coordinates(
|
|
1369
|
+
[atom_category[field] for field in alt_coord_fields],
|
|
1370
|
+
allow_missing=allow_missing_coord,
|
|
1172
1371
|
)
|
|
1173
|
-
for i, field in enumerate(alt_coord_fields):
|
|
1174
|
-
array.coord[:, i] = atom_category[field].as_array(np.float32)
|
|
1175
1372
|
|
|
1176
1373
|
try:
|
|
1177
1374
|
bond_category = block["chem_comp_bond"]
|
|
@@ -1181,7 +1378,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1181
1378
|
)
|
|
1182
1379
|
except KeyError:
|
|
1183
1380
|
warnings.warn(
|
|
1184
|
-
"Category 'chem_comp_bond' not found.
|
|
1381
|
+
"Category 'chem_comp_bond' not found. No bonds will be parsed",
|
|
1185
1382
|
UserWarning,
|
|
1186
1383
|
)
|
|
1187
1384
|
else:
|
|
@@ -1201,6 +1398,23 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1201
1398
|
return array
|
|
1202
1399
|
|
|
1203
1400
|
|
|
1401
|
+
def _parse_component_coordinates(coord_columns, allow_missing=False):
|
|
1402
|
+
coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
|
|
1403
|
+
for i, column in enumerate(coord_columns):
|
|
1404
|
+
if column.mask is not None and column.mask.array.any():
|
|
1405
|
+
if allow_missing:
|
|
1406
|
+
warnings.warn(
|
|
1407
|
+
"Missing coordinates for some atoms. Those will be set to nan",
|
|
1408
|
+
UserWarning,
|
|
1409
|
+
)
|
|
1410
|
+
else:
|
|
1411
|
+
raise ValueError(
|
|
1412
|
+
"Missing coordinates for some atoms",
|
|
1413
|
+
)
|
|
1414
|
+
coord[:, i] = column.as_array(np.float32, masked_value=np.nan)
|
|
1415
|
+
return coord
|
|
1416
|
+
|
|
1417
|
+
|
|
1204
1418
|
def set_component(pdbx_file, array, data_block=None):
|
|
1205
1419
|
"""
|
|
1206
1420
|
Set the ``chem_comp_atom`` and, if bonds are available,
|
|
@@ -1305,6 +1519,7 @@ def list_assemblies(pdbx_file, data_block=None):
|
|
|
1305
1519
|
|
|
1306
1520
|
Examples
|
|
1307
1521
|
--------
|
|
1522
|
+
|
|
1308
1523
|
>>> import os.path
|
|
1309
1524
|
>>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
|
|
1310
1525
|
>>> assembly_ids = list_assemblies(file)
|
|
@@ -1417,7 +1632,10 @@ def get_assembly(
|
|
|
1417
1632
|
Returns
|
|
1418
1633
|
-------
|
|
1419
1634
|
assembly : AtomArray or AtomArrayStack
|
|
1420
|
-
The assembly.
|
|
1635
|
+
The assembly.
|
|
1636
|
+
The return type depends on the `model` parameter.
|
|
1637
|
+
Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
|
|
1638
|
+
unit in the assembly.
|
|
1421
1639
|
|
|
1422
1640
|
Examples
|
|
1423
1641
|
--------
|
|
@@ -1506,7 +1724,6 @@ def _apply_transformations(structure, transformation_dict, operations):
|
|
|
1506
1724
|
"""
|
|
1507
1725
|
# Additional first dimension for 'structure.repeat()'
|
|
1508
1726
|
assembly_coord = np.zeros((len(operations),) + structure.coord.shape)
|
|
1509
|
-
|
|
1510
1727
|
# Apply corresponding transformation for each copy in the assembly
|
|
1511
1728
|
for i, operation in enumerate(operations):
|
|
1512
1729
|
coord = structure.coord
|
|
@@ -1520,7 +1737,11 @@ def _apply_transformations(structure, transformation_dict, operations):
|
|
|
1520
1737
|
coord += translation_vector
|
|
1521
1738
|
assembly_coord[i] = coord
|
|
1522
1739
|
|
|
1523
|
-
|
|
1740
|
+
assembly = repeat(structure, assembly_coord)
|
|
1741
|
+
assembly.set_annotation(
|
|
1742
|
+
"sym_id", np.repeat(np.arange(len(operations)), structure.array_length())
|
|
1743
|
+
)
|
|
1744
|
+
return assembly
|
|
1524
1745
|
|
|
1525
1746
|
|
|
1526
1747
|
def _get_transformations(struct_oper):
|
|
@@ -1596,4 +1817,118 @@ def _convert_string_to_sequence(string, stype):
|
|
|
1596
1817
|
elif stype in _other_type_list:
|
|
1597
1818
|
return None
|
|
1598
1819
|
else:
|
|
1599
|
-
raise InvalidFileError("mmCIF _entity_poly.type unsupported
|
|
1820
|
+
raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
|
|
1821
|
+
|
|
1822
|
+
|
|
1823
|
+
def get_sse(pdbx_file, data_block=None, match_model=None):
|
|
1824
|
+
"""
|
|
1825
|
+
Get the secondary structure from a PDBx file.
|
|
1826
|
+
|
|
1827
|
+
Parameters
|
|
1828
|
+
----------
|
|
1829
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
1830
|
+
The file object.
|
|
1831
|
+
The following categories are required:
|
|
1832
|
+
|
|
1833
|
+
- ``entity_poly``
|
|
1834
|
+
- ``struct_conf`` (if alpha-helices are present)
|
|
1835
|
+
- ``struct_sheet_range`` (if beta-strands are present)
|
|
1836
|
+
- ``atom_site`` (if `match_model` is set)
|
|
1837
|
+
|
|
1838
|
+
data_block : str, optional
|
|
1839
|
+
The name of the data block.
|
|
1840
|
+
Default is the first (and most times only) data block of the
|
|
1841
|
+
file.
|
|
1842
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
1843
|
+
this parameter is ignored.
|
|
1844
|
+
match_model : int, optional
|
|
1845
|
+
If a model number is given, only secondary structure elements for residues that
|
|
1846
|
+
are resolved in the given model are kept.
|
|
1847
|
+
This means secondary structure elements for residues that would not appear
|
|
1848
|
+
in a corresponding :class:`AtomArray` from :func:`get_structure()` are removed.
|
|
1849
|
+
By default, all residues in the sequence are kept.
|
|
1850
|
+
|
|
1851
|
+
Returns
|
|
1852
|
+
-------
|
|
1853
|
+
sse_dict : dict of str -> ndarray, dtype=str
|
|
1854
|
+
The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
|
|
1855
|
+
secondary structure of the respective chain.
|
|
1856
|
+
|
|
1857
|
+
- ``"a"``: alpha-helix
|
|
1858
|
+
- ``"b"``: beta-strand
|
|
1859
|
+
- ``"c"``: coil or not an amino acid
|
|
1860
|
+
|
|
1861
|
+
Each secondary structure element corresponds to the ``label_seq_id`` of the
|
|
1862
|
+
``atom_site`` category.
|
|
1863
|
+
This means that the 0-th position of the array corresponds to the residue
|
|
1864
|
+
in ``atom_site`` with ``label_seq_id`` ``1``.
|
|
1865
|
+
|
|
1866
|
+
Examples
|
|
1867
|
+
--------
|
|
1868
|
+
|
|
1869
|
+
>>> import os.path
|
|
1870
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
|
|
1871
|
+
>>> sse = get_sse(file, match_model=1)
|
|
1872
|
+
>>> print(sse)
|
|
1873
|
+
{'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
|
|
1874
|
+
'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
|
|
1875
|
+
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
|
|
1876
|
+
'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
|
|
1877
|
+
'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
|
|
1878
|
+
'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
|
|
1879
|
+
'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
|
|
1880
|
+
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
|
|
1881
|
+
'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
|
|
1882
|
+
'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
|
|
1883
|
+
dtype='<U1')}
|
|
1884
|
+
|
|
1885
|
+
If only secondary structure elements for resolved residues are requested, the length
|
|
1886
|
+
of the returned array matches the number of peptide residues in the structure.
|
|
1887
|
+
|
|
1888
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
|
|
1889
|
+
>>> print(len(get_sse(file, match_model=1)["A"]))
|
|
1890
|
+
128
|
|
1891
|
+
>>> atoms = get_structure(file, model=1)
|
|
1892
|
+
>>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
|
|
1893
|
+
>>> print(get_residue_count(atoms))
|
|
1894
|
+
128
|
|
1895
|
+
"""
|
|
1896
|
+
block = _get_block(pdbx_file, data_block)
|
|
1897
|
+
|
|
1898
|
+
# Init all chains with "c" for coil
|
|
1899
|
+
sse_dict = {
|
|
1900
|
+
chain_id: np.repeat("c", len(sequence))
|
|
1901
|
+
for chain_id, sequence in get_sequence(block).items()
|
|
1902
|
+
}
|
|
1903
|
+
|
|
1904
|
+
# Populate SSE arrays with helices and strands
|
|
1905
|
+
for sse_symbol, category_name in [
|
|
1906
|
+
("a", "struct_conf"),
|
|
1907
|
+
("b", "struct_sheet_range"),
|
|
1908
|
+
]:
|
|
1909
|
+
if category_name in block:
|
|
1910
|
+
category = block[category_name]
|
|
1911
|
+
chains = category["beg_auth_asym_id"].as_array(str)
|
|
1912
|
+
start_positions = category["beg_label_seq_id"].as_array(int)
|
|
1913
|
+
end_positions = category["end_label_seq_id"].as_array(int)
|
|
1914
|
+
|
|
1915
|
+
# Set the positions of the current SSE type (helix or strand)
|
|
1916
|
+
for chain, start, end in zip(chains, start_positions, end_positions):
|
|
1917
|
+
# Translate the 1-based positions from PDBx into 0-based array indices
|
|
1918
|
+
sse_dict[chain][start - 1 : end] = sse_symbol
|
|
1919
|
+
|
|
1920
|
+
if match_model is not None:
|
|
1921
|
+
model_atom_site = _filter_model(block["atom_site"], match_model)
|
|
1922
|
+
chain_ids = model_atom_site["auth_asym_id"].as_array(str)
|
|
1923
|
+
res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
|
|
1924
|
+
# Filter out masked residues, i.e. residues not part of a chain
|
|
1925
|
+
mask = res_ids != -1
|
|
1926
|
+
chain_ids = chain_ids[mask]
|
|
1927
|
+
res_ids = res_ids[mask]
|
|
1928
|
+
for chain_id, sse in sse_dict.items():
|
|
1929
|
+
res_ids_in_chain = res_ids[chain_ids == chain_id]
|
|
1930
|
+
# Transform from 1-based residue ID to 0-based index
|
|
1931
|
+
indices = np.unique(res_ids_in_chain) - 1
|
|
1932
|
+
sse_dict[chain_id] = sse[indices]
|
|
1933
|
+
|
|
1934
|
+
return sse_dict
|