biotite 1.1.0__cp313-cp313-win_amd64.whl → 1.3.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/application.py +3 -3
- biotite/application/autodock/app.py +1 -1
- biotite/application/blast/webapp.py +1 -1
- biotite/application/clustalo/app.py +1 -1
- biotite/application/localapp.py +2 -2
- biotite/application/msaapp.py +10 -10
- biotite/application/muscle/app3.py +3 -3
- biotite/application/muscle/app5.py +3 -3
- biotite/application/sra/app.py +0 -5
- biotite/application/util.py +21 -1
- biotite/application/viennarna/rnaalifold.py +8 -8
- biotite/application/viennarna/rnaplot.py +10 -8
- biotite/application/viennarna/util.py +1 -1
- biotite/application/webapp.py +1 -1
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +191 -0
- biotite/database/entrez/dbnames.py +10 -0
- biotite/database/entrez/download.py +9 -10
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +5 -4
- biotite/database/pubchem/download.py +6 -6
- biotite/database/pubchem/error.py +10 -0
- biotite/database/pubchem/query.py +12 -23
- biotite/database/rcsb/download.py +3 -2
- biotite/database/rcsb/query.py +2 -3
- biotite/database/uniprot/check.py +2 -2
- biotite/database/uniprot/download.py +2 -5
- biotite/database/uniprot/query.py +3 -4
- biotite/file.py +14 -2
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1226 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/align/__init__.py +0 -4
- biotite/sequence/align/alignment.py +33 -11
- biotite/sequence/align/banded.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +22 -22
- biotite/sequence/align/cigar.py +2 -2
- biotite/sequence/align/kmeralphabet.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +2 -2
- biotite/sequence/align/kmersimilarity.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +6 -6
- biotite/sequence/align/localgapped.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +47 -47
- biotite/sequence/align/localungapped.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +10 -10
- biotite/sequence/align/matrix.py +12 -3
- biotite/sequence/align/multiple.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +1 -2
- biotite/sequence/align/pairwise.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +37 -39
- biotite/sequence/align/permutation.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.cp313-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +2 -2
- biotite/sequence/align/statistics.py +1 -1
- biotite/sequence/align/tracetable.cp313-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +2 -2
- biotite/sequence/annotation.py +19 -13
- biotite/sequence/codec.cp313-win_amd64.pyd +0 -0
- biotite/sequence/codon.py +1 -2
- biotite/sequence/graphics/alignment.py +25 -39
- biotite/sequence/graphics/dendrogram.py +4 -2
- biotite/sequence/graphics/features.py +2 -2
- biotite/sequence/graphics/logo.py +10 -12
- biotite/sequence/io/fasta/convert.py +1 -2
- biotite/sequence/io/fasta/file.py +1 -1
- biotite/sequence/io/fastq/file.py +3 -3
- biotite/sequence/io/genbank/file.py +3 -3
- biotite/sequence/io/genbank/sequence.py +2 -0
- biotite/sequence/io/gff/convert.py +1 -1
- biotite/sequence/io/gff/file.py +1 -2
- biotite/sequence/phylo/nj.cp313-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp313-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp313-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +19 -25
- biotite/sequence/search.py +0 -1
- biotite/sequence/seqtypes.py +12 -5
- biotite/sequence/sequence.py +1 -2
- biotite/structure/__init__.py +2 -0
- biotite/structure/alphabet/i3d.py +1 -2
- biotite/structure/alphabet/pb.py +1 -2
- biotite/structure/alphabet/unkerasify.py +8 -2
- biotite/structure/atoms.py +35 -27
- biotite/structure/basepairs.py +39 -40
- biotite/structure/bonds.cp313-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +8 -5
- biotite/structure/box.py +159 -23
- biotite/structure/celllist.cp313-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +83 -68
- biotite/structure/chains.py +17 -55
- biotite/structure/charges.cp313-win_amd64.pyd +0 -0
- biotite/structure/compare.py +420 -13
- biotite/structure/density.py +1 -1
- biotite/structure/dotbracket.py +31 -32
- biotite/structure/filter.py +8 -8
- biotite/structure/geometry.py +15 -15
- biotite/structure/graphics/rna.py +19 -16
- biotite/structure/hbond.py +18 -21
- biotite/structure/info/atoms.py +11 -2
- biotite/structure/info/ccd.py +0 -2
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +0 -3
- biotite/structure/info/misc.py +0 -1
- biotite/structure/info/radii.py +92 -22
- biotite/structure/info/standardize.py +1 -2
- biotite/structure/integrity.py +4 -6
- biotite/structure/io/general.py +2 -2
- biotite/structure/io/gro/file.py +8 -9
- biotite/structure/io/mol/convert.py +1 -1
- biotite/structure/io/mol/ctab.py +33 -28
- biotite/structure/io/mol/mol.py +1 -1
- biotite/structure/io/mol/sdf.py +39 -13
- biotite/structure/io/pdb/convert.py +86 -5
- biotite/structure/io/pdb/file.py +90 -24
- biotite/structure/io/pdb/hybrid36.cp313-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +4 -4
- biotite/structure/io/pdbx/bcif.py +22 -7
- biotite/structure/io/pdbx/cif.py +20 -7
- biotite/structure/io/pdbx/component.py +6 -0
- biotite/structure/io/pdbx/compress.py +71 -34
- biotite/structure/io/pdbx/convert.py +429 -77
- biotite/structure/io/pdbx/encoding.cp313-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +39 -23
- biotite/structure/io/trajfile.py +9 -6
- biotite/structure/io/util.py +38 -0
- biotite/structure/mechanics.py +0 -1
- biotite/structure/molecules.py +0 -15
- biotite/structure/pseudoknots.py +13 -19
- biotite/structure/repair.py +2 -4
- biotite/structure/residues.py +20 -48
- biotite/structure/rings.py +335 -0
- biotite/structure/sasa.cp313-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +30 -30
- biotite/structure/segments.py +123 -9
- biotite/structure/sequence.py +0 -1
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +0 -2
- biotite/structure/superimpose.py +75 -253
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +232 -26
- biotite/structure/util.py +3 -3
- biotite/version.py +9 -4
- biotite/visualize.py +111 -1
- {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/METADATA +8 -36
- {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/RECORD +160 -138
- {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/WHEEL +1 -1
- {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# information.
|
|
4
4
|
|
|
5
5
|
__name__ = "biotite.structure.io.pdbx"
|
|
6
|
-
__author__ = "Fabrice Allain, Patrick Kunzmann"
|
|
6
|
+
__author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
|
|
7
7
|
__all__ = [
|
|
8
8
|
"get_sequence",
|
|
9
9
|
"get_model_count",
|
|
@@ -13,16 +13,30 @@ __all__ = [
|
|
|
13
13
|
"set_component",
|
|
14
14
|
"list_assemblies",
|
|
15
15
|
"get_assembly",
|
|
16
|
+
"get_unit_cell",
|
|
17
|
+
"get_sse",
|
|
16
18
|
]
|
|
17
19
|
|
|
18
20
|
import itertools
|
|
19
21
|
import warnings
|
|
22
|
+
from collections import defaultdict
|
|
20
23
|
import numpy as np
|
|
21
24
|
from biotite.file import InvalidFileError
|
|
22
25
|
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
|
|
23
|
-
from biotite.structure.atoms import
|
|
26
|
+
from biotite.structure.atoms import (
|
|
27
|
+
AtomArray,
|
|
28
|
+
AtomArrayStack,
|
|
29
|
+
concatenate,
|
|
30
|
+
repeat,
|
|
31
|
+
)
|
|
24
32
|
from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
|
|
25
|
-
from biotite.structure.box import
|
|
33
|
+
from biotite.structure.box import (
|
|
34
|
+
coord_to_fraction,
|
|
35
|
+
fraction_to_coord,
|
|
36
|
+
space_group_transforms,
|
|
37
|
+
unitcell_from_vectors,
|
|
38
|
+
vectors_from_unitcell,
|
|
39
|
+
)
|
|
26
40
|
from biotite.structure.error import BadStructureError
|
|
27
41
|
from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
|
|
28
42
|
from biotite.structure.filter import (
|
|
@@ -32,6 +46,7 @@ from biotite.structure.filter import (
|
|
|
32
46
|
filter_first_altloc,
|
|
33
47
|
filter_highest_occupancy_altloc,
|
|
34
48
|
)
|
|
49
|
+
from biotite.structure.geometry import centroid
|
|
35
50
|
from biotite.structure.io.pdbx.bcif import (
|
|
36
51
|
BinaryCIFBlock,
|
|
37
52
|
BinaryCIFColumn,
|
|
@@ -45,7 +60,7 @@ from biotite.structure.residues import (
|
|
|
45
60
|
get_residue_positions,
|
|
46
61
|
get_residue_starts_for,
|
|
47
62
|
)
|
|
48
|
-
from biotite.structure.
|
|
63
|
+
from biotite.structure.transform import AffineTransformation
|
|
49
64
|
|
|
50
65
|
# Bond types in `struct_conn` category that refer to covalent bonds
|
|
51
66
|
PDBX_BOND_TYPE_ID_TO_TYPE = {
|
|
@@ -81,6 +96,7 @@ PDBX_BOND_TYPE_TO_ORDER = {
|
|
|
81
96
|
BondType.AROMATIC_TRIPLE: "trip",
|
|
82
97
|
# These are masked later, it is merely added here to avoid a KeyError
|
|
83
98
|
BondType.ANY: "",
|
|
99
|
+
BondType.AROMATIC: "",
|
|
84
100
|
BondType.COORDINATION: "",
|
|
85
101
|
}
|
|
86
102
|
# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
|
|
@@ -92,12 +108,19 @@ COMP_BOND_ORDER_TO_TYPE = {
|
|
|
92
108
|
("SING", "Y"): BondType.AROMATIC_SINGLE,
|
|
93
109
|
("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
|
|
94
110
|
("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
|
|
111
|
+
("AROM", "Y"): BondType.AROMATIC,
|
|
95
112
|
}
|
|
96
113
|
# ...and vice versa
|
|
97
114
|
COMP_BOND_TYPE_TO_ORDER = {
|
|
98
115
|
bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
|
|
99
116
|
}
|
|
100
117
|
CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
|
|
118
|
+
# It was observed that when the number of rows in `atom_site` and `struct_conn`
|
|
119
|
+
# exceed a certain threshold,
|
|
120
|
+
# a dictionary approach is less computation and memory intensive than the dense
|
|
121
|
+
# vectorized approach.
|
|
122
|
+
# https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
|
|
123
|
+
FIND_MATCHES_SWITCH_THRESHOLD = 4000000
|
|
101
124
|
|
|
102
125
|
_proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
|
|
103
126
|
_nucleotideseq_type_list = [
|
|
@@ -116,8 +139,7 @@ _other_type_list = [
|
|
|
116
139
|
|
|
117
140
|
def _filter(category, index):
|
|
118
141
|
"""
|
|
119
|
-
Reduce the
|
|
120
|
-
model.
|
|
142
|
+
Reduce the given category to the values selected by the given index.
|
|
121
143
|
"""
|
|
122
144
|
Category = type(category)
|
|
123
145
|
Column = Category.subcomponent_class()
|
|
@@ -160,8 +182,8 @@ def get_sequence(pdbx_file, data_block=None):
|
|
|
160
182
|
-------
|
|
161
183
|
sequence_dict : Dictionary of Sequences
|
|
162
184
|
Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
|
|
163
|
-
(
|
|
164
|
-
|
|
185
|
+
(equivalent to ``atom_site.auth_asym_id``).
|
|
186
|
+
Dictionary values are sequences.
|
|
165
187
|
|
|
166
188
|
Notes
|
|
167
189
|
-----
|
|
@@ -217,9 +239,7 @@ def get_model_count(pdbx_file, data_block=None):
|
|
|
217
239
|
The number of models.
|
|
218
240
|
"""
|
|
219
241
|
block = _get_block(pdbx_file, data_block)
|
|
220
|
-
return len(
|
|
221
|
-
_get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))
|
|
222
|
-
)
|
|
242
|
+
return len(np.unique((block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))))
|
|
223
243
|
|
|
224
244
|
|
|
225
245
|
def get_structure(
|
|
@@ -310,7 +330,6 @@ def get_structure(
|
|
|
310
330
|
>>> arr = get_structure(file, model=1)
|
|
311
331
|
>>> print(len(arr))
|
|
312
332
|
304
|
|
313
|
-
|
|
314
333
|
"""
|
|
315
334
|
block = _get_block(pdbx_file, data_block)
|
|
316
335
|
|
|
@@ -321,13 +340,12 @@ def get_structure(
|
|
|
321
340
|
raise InvalidFileError("Missing 'atom_site' category in file")
|
|
322
341
|
|
|
323
342
|
models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
|
|
324
|
-
|
|
325
|
-
model_count = len(model_starts)
|
|
343
|
+
model_count = len(np.unique(models))
|
|
326
344
|
atom_count = len(models)
|
|
327
345
|
|
|
328
346
|
if model is None:
|
|
329
347
|
# For a stack, the annotations are derived from the first model
|
|
330
|
-
model_atom_site = _filter_model(atom_site,
|
|
348
|
+
model_atom_site = _filter_model(atom_site, 1)
|
|
331
349
|
# Any field of the category would work here to get the length
|
|
332
350
|
model_length = model_atom_site.row_count
|
|
333
351
|
atoms = AtomArrayStack(model_count, model_length)
|
|
@@ -373,7 +391,7 @@ def get_structure(
|
|
|
373
391
|
f"the given model {model} does not exist"
|
|
374
392
|
)
|
|
375
393
|
|
|
376
|
-
model_atom_site = _filter_model(atom_site,
|
|
394
|
+
model_atom_site = _filter_model(atom_site, model)
|
|
377
395
|
# Any field of the category would work here to get the length
|
|
378
396
|
model_length = model_atom_site.row_count
|
|
379
397
|
atoms = AtomArray(model_length)
|
|
@@ -386,7 +404,16 @@ def get_structure(
|
|
|
386
404
|
|
|
387
405
|
# The below part is the same for both, AtomArray and AtomArrayStack
|
|
388
406
|
_fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
|
|
407
|
+
|
|
408
|
+
atoms, altloc_filtered_atom_site = _filter_altloc(atoms, model_atom_site, altloc)
|
|
409
|
+
|
|
389
410
|
if include_bonds:
|
|
411
|
+
if altloc == "all":
|
|
412
|
+
raise ValueError(
|
|
413
|
+
"Bond computation is not supported with `altloc='all', consider using "
|
|
414
|
+
"'connect_via_residue_names()' afterwards"
|
|
415
|
+
)
|
|
416
|
+
|
|
390
417
|
if "chem_comp_bond" in block:
|
|
391
418
|
try:
|
|
392
419
|
custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
|
|
@@ -402,10 +429,13 @@ def get_structure(
|
|
|
402
429
|
bonds = connect_via_residue_names(atoms)
|
|
403
430
|
if "struct_conn" in block:
|
|
404
431
|
bonds = bonds.merge(
|
|
405
|
-
_parse_inter_residue_bonds(
|
|
432
|
+
_parse_inter_residue_bonds(
|
|
433
|
+
altloc_filtered_atom_site,
|
|
434
|
+
block["struct_conn"],
|
|
435
|
+
atom_count=atoms.array_length(),
|
|
436
|
+
)
|
|
406
437
|
)
|
|
407
438
|
atoms.bonds = bonds
|
|
408
|
-
atoms = _filter_altloc(atoms, model_atom_site, altloc)
|
|
409
439
|
|
|
410
440
|
return atoms
|
|
411
441
|
|
|
@@ -565,11 +595,12 @@ def _parse_intra_residue_bonds(chem_comp_bond):
|
|
|
565
595
|
return custom_bond_dict
|
|
566
596
|
|
|
567
597
|
|
|
568
|
-
def _parse_inter_residue_bonds(atom_site, struct_conn):
|
|
598
|
+
def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
|
|
569
599
|
"""
|
|
570
600
|
Create inter-residue bonds by parsing the ``struct_conn`` category.
|
|
571
601
|
The atom indices of each bond are found by matching the bond labels
|
|
572
602
|
to the ``atom_site`` category.
|
|
603
|
+
If atom_count is None, it will be inferred from the ``atom_site`` category.
|
|
573
604
|
"""
|
|
574
605
|
# Identity symmetry operation
|
|
575
606
|
IDENTITY = "1_555"
|
|
@@ -638,7 +669,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
|
|
|
638
669
|
bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
|
|
639
670
|
|
|
640
671
|
return BondList(
|
|
641
|
-
atom_site.row_count,
|
|
672
|
+
atom_count if atom_count is not None else atom_site.row_count,
|
|
642
673
|
np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
|
|
643
674
|
)
|
|
644
675
|
|
|
@@ -649,6 +680,17 @@ def _find_matches(query_arrays, reference_arrays):
|
|
|
649
680
|
`reference_arrays` where all query values match the reference counterpart.
|
|
650
681
|
If no match is found for a query, the corresponding index is -1.
|
|
651
682
|
"""
|
|
683
|
+
if (
|
|
684
|
+
query_arrays[0].shape[0] * reference_arrays[0].shape[0]
|
|
685
|
+
<= FIND_MATCHES_SWITCH_THRESHOLD
|
|
686
|
+
):
|
|
687
|
+
match_indices = _find_matches_by_dense_array(query_arrays, reference_arrays)
|
|
688
|
+
else:
|
|
689
|
+
match_indices = _find_matches_by_dict(query_arrays, reference_arrays)
|
|
690
|
+
return match_indices
|
|
691
|
+
|
|
692
|
+
|
|
693
|
+
def _find_matches_by_dense_array(query_arrays, reference_arrays):
|
|
652
694
|
match_masks_for_all_columns = np.stack(
|
|
653
695
|
[
|
|
654
696
|
query[:, np.newaxis] == reference[np.newaxis, :]
|
|
@@ -676,6 +718,38 @@ def _find_matches(query_arrays, reference_arrays):
|
|
|
676
718
|
return match_indices
|
|
677
719
|
|
|
678
720
|
|
|
721
|
+
def _find_matches_by_dict(query_arrays, reference_arrays):
|
|
722
|
+
# Convert reference arrays to a dictionary for O(1) lookups
|
|
723
|
+
reference_dict = {}
|
|
724
|
+
ambiguous_keys = set()
|
|
725
|
+
for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
|
|
726
|
+
ref_key = tuple(ref_row)
|
|
727
|
+
if ref_key in reference_dict:
|
|
728
|
+
ambiguous_keys.add(ref_key)
|
|
729
|
+
continue
|
|
730
|
+
reference_dict[ref_key] = ref_idx
|
|
731
|
+
|
|
732
|
+
match_indices = []
|
|
733
|
+
for query_idx, query_row in enumerate(zip(*query_arrays)):
|
|
734
|
+
query_key = tuple(query_row)
|
|
735
|
+
occurrence = reference_dict.get(query_key)
|
|
736
|
+
|
|
737
|
+
if occurrence is None:
|
|
738
|
+
# -1 indicates that no match was found in the reference
|
|
739
|
+
match_indices.append(-1)
|
|
740
|
+
elif query_key in ambiguous_keys:
|
|
741
|
+
# The query cannot be uniquely matched to an atom in the reference
|
|
742
|
+
raise InvalidFileError(
|
|
743
|
+
f"The covalent bond in the 'struct_conn' category at index "
|
|
744
|
+
f"{query_idx} cannot be unambiguously assigned to atoms in "
|
|
745
|
+
f"the 'atom_site' category"
|
|
746
|
+
)
|
|
747
|
+
else:
|
|
748
|
+
match_indices.append(occurrence)
|
|
749
|
+
|
|
750
|
+
return np.array(match_indices)
|
|
751
|
+
|
|
752
|
+
|
|
679
753
|
def _get_struct_conn_col_name(col_name, partner):
|
|
680
754
|
"""
|
|
681
755
|
For a column name in ``atom_site`` get the corresponding column name
|
|
@@ -691,44 +765,52 @@ def _get_struct_conn_col_name(col_name, partner):
|
|
|
691
765
|
|
|
692
766
|
|
|
693
767
|
def _filter_altloc(array, atom_site, altloc):
|
|
768
|
+
"""
|
|
769
|
+
Filter the given :class:`AtomArray` and ``atom_site`` category to the rows
|
|
770
|
+
specified by the given *altloc* identifier.
|
|
771
|
+
"""
|
|
694
772
|
altloc_ids = atom_site.get("label_alt_id")
|
|
695
773
|
occupancy = atom_site.get("occupancy")
|
|
696
774
|
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
return array
|
|
775
|
+
if altloc == "all":
|
|
776
|
+
array.set_annotation("altloc_id", altloc_ids.as_array(str))
|
|
777
|
+
return array, atom_site
|
|
778
|
+
elif altloc_ids is None or (altloc_ids.mask.array != MaskValue.PRESENT).all():
|
|
779
|
+
# No altlocs in atom_site category
|
|
780
|
+
return array, atom_site
|
|
700
781
|
elif altloc == "occupancy" and occupancy is not None:
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
),
|
|
706
|
-
]
|
|
782
|
+
mask = filter_highest_occupancy_altloc(
|
|
783
|
+
array, altloc_ids.as_array(str), occupancy.as_array(float)
|
|
784
|
+
)
|
|
785
|
+
return array[..., mask], _filter(atom_site, mask)
|
|
707
786
|
# 'first' is also fallback if file has no occupancy information
|
|
708
787
|
elif altloc == "first":
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
array.set_annotation("altloc_id", altloc_ids.as_array(str))
|
|
712
|
-
return array
|
|
788
|
+
mask = filter_first_altloc(array, altloc_ids.as_array(str))
|
|
789
|
+
return array[..., mask], _filter(atom_site, mask)
|
|
713
790
|
else:
|
|
714
791
|
raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
|
|
715
792
|
|
|
716
793
|
|
|
717
|
-
def
|
|
718
|
-
"""
|
|
719
|
-
Get the start index for each model in the arrays of the
|
|
720
|
-
``atom_site`` category.
|
|
721
|
-
"""
|
|
722
|
-
_, indices = np.unique(model_array, return_index=True)
|
|
723
|
-
indices.sort()
|
|
724
|
-
return indices
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
def _filter_model(atom_site, model_starts, model):
|
|
794
|
+
def _filter_model(atom_site, model):
|
|
728
795
|
"""
|
|
729
796
|
Reduce the ``atom_site`` category to the values for the given
|
|
730
797
|
model.
|
|
798
|
+
|
|
799
|
+
Parameters
|
|
800
|
+
----------
|
|
801
|
+
atom_site : CIFCategory or BinaryCIFCategory
|
|
802
|
+
``atom_site`` category containing all models.
|
|
803
|
+
model : int
|
|
804
|
+
The model to be selected.
|
|
805
|
+
|
|
806
|
+
Returns
|
|
807
|
+
-------
|
|
808
|
+
atom_site : CIFCategory or BinaryCIFCategory
|
|
809
|
+
The ``atom_site`` category containing only the selected model.
|
|
731
810
|
"""
|
|
811
|
+
models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
|
|
812
|
+
_, model_starts = np.unique(models, return_index=True)
|
|
813
|
+
model_starts.sort()
|
|
732
814
|
# Append exclusive stop
|
|
733
815
|
model_starts = np.append(model_starts, [atom_site.row_count])
|
|
734
816
|
# Indexing starts at 0, but model number starts at 1
|
|
@@ -815,7 +897,6 @@ def set_structure(
|
|
|
815
897
|
>>> file = CIFFile()
|
|
816
898
|
>>> set_structure(file, atom_array)
|
|
817
899
|
>>> file.write(os.path.join(path_to_directory, "structure.cif"))
|
|
818
|
-
|
|
819
900
|
"""
|
|
820
901
|
_check_non_empty(array)
|
|
821
902
|
|
|
@@ -836,7 +917,11 @@ def set_structure(
|
|
|
836
917
|
)
|
|
837
918
|
atom_site["label_comp_id"] = np.copy(array.res_name)
|
|
838
919
|
atom_site["label_asym_id"] = np.copy(array.chain_id)
|
|
839
|
-
atom_site["label_entity_id"] =
|
|
920
|
+
atom_site["label_entity_id"] = (
|
|
921
|
+
np.copy(array.label_entity_id)
|
|
922
|
+
if "label_entity_id" in array.get_annotation_categories()
|
|
923
|
+
else _determine_entity_id(array.chain_id)
|
|
924
|
+
)
|
|
840
925
|
atom_site["label_seq_id"] = np.copy(array.res_id)
|
|
841
926
|
atom_site["pdbx_PDB_ins_code"] = Column(
|
|
842
927
|
np.copy(array.ins_code),
|
|
@@ -1181,7 +1266,13 @@ def _filter_canonical_links(array, bond_array):
|
|
|
1181
1266
|
) # fmt: skip
|
|
1182
1267
|
|
|
1183
1268
|
|
|
1184
|
-
def get_component(
|
|
1269
|
+
def get_component(
|
|
1270
|
+
pdbx_file,
|
|
1271
|
+
data_block=None,
|
|
1272
|
+
use_ideal_coord=True,
|
|
1273
|
+
res_name=None,
|
|
1274
|
+
allow_missing_coord=False,
|
|
1275
|
+
):
|
|
1185
1276
|
"""
|
|
1186
1277
|
Create an :class:`AtomArray` for a chemical component from the
|
|
1187
1278
|
``chem_comp_atom`` and, if available, the ``chem_comp_bond``
|
|
@@ -1209,6 +1300,11 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1209
1300
|
In this case, the component with the given residue name is
|
|
1210
1301
|
read.
|
|
1211
1302
|
By default, all rows would be read in this case.
|
|
1303
|
+
allow_missing_coord : bool, optional
|
|
1304
|
+
Whether to allow missing coordinate values in components.
|
|
1305
|
+
If ``True``, these will be represented as ``nan`` values.
|
|
1306
|
+
If ``False``, a ``ValueError`` is raised when missing coordinates
|
|
1307
|
+
are encountered.
|
|
1212
1308
|
|
|
1213
1309
|
Returns
|
|
1214
1310
|
-------
|
|
@@ -1299,7 +1395,8 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1299
1395
|
else:
|
|
1300
1396
|
raise
|
|
1301
1397
|
array.coord = _parse_component_coordinates(
|
|
1302
|
-
[atom_category[field] for field in alt_coord_fields]
|
|
1398
|
+
[atom_category[field] for field in alt_coord_fields],
|
|
1399
|
+
allow_missing=allow_missing_coord,
|
|
1303
1400
|
)
|
|
1304
1401
|
|
|
1305
1402
|
try:
|
|
@@ -1310,7 +1407,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1310
1407
|
)
|
|
1311
1408
|
except KeyError:
|
|
1312
1409
|
warnings.warn(
|
|
1313
|
-
"Category 'chem_comp_bond' not found.
|
|
1410
|
+
"Category 'chem_comp_bond' not found. No bonds will be parsed",
|
|
1314
1411
|
UserWarning,
|
|
1315
1412
|
)
|
|
1316
1413
|
else:
|
|
@@ -1330,14 +1427,20 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
|
|
|
1330
1427
|
return array
|
|
1331
1428
|
|
|
1332
1429
|
|
|
1333
|
-
def _parse_component_coordinates(coord_columns):
|
|
1430
|
+
def _parse_component_coordinates(coord_columns, allow_missing=False):
|
|
1334
1431
|
coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
|
|
1335
1432
|
for i, column in enumerate(coord_columns):
|
|
1336
1433
|
if column.mask is not None and column.mask.array.any():
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1434
|
+
if allow_missing:
|
|
1435
|
+
warnings.warn(
|
|
1436
|
+
"Missing coordinates for some atoms. Those will be set to nan",
|
|
1437
|
+
UserWarning,
|
|
1438
|
+
)
|
|
1439
|
+
else:
|
|
1440
|
+
raise ValueError(
|
|
1441
|
+
"Missing coordinates for some atoms",
|
|
1442
|
+
)
|
|
1443
|
+
coord[:, i] = column.as_array(np.float32, masked_value=np.nan)
|
|
1341
1444
|
return coord
|
|
1342
1445
|
|
|
1343
1446
|
|
|
@@ -1445,6 +1548,7 @@ def list_assemblies(pdbx_file, data_block=None):
|
|
|
1445
1548
|
|
|
1446
1549
|
Examples
|
|
1447
1550
|
--------
|
|
1551
|
+
|
|
1448
1552
|
>>> import os.path
|
|
1449
1553
|
>>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
|
|
1450
1554
|
>>> assembly_ids = list_assemblies(file)
|
|
@@ -1611,7 +1715,7 @@ def get_assembly(
|
|
|
1611
1715
|
)
|
|
1612
1716
|
|
|
1613
1717
|
### Get transformations and apply them to the affected asym IDs
|
|
1614
|
-
|
|
1718
|
+
chain_ops = defaultdict(list)
|
|
1615
1719
|
for id, op_expr, asym_id_expr in zip(
|
|
1616
1720
|
assembly_gen_category["assembly_id"].as_array(str),
|
|
1617
1721
|
assembly_gen_category["oper_expression"].as_array(str),
|
|
@@ -1620,19 +1724,22 @@ def get_assembly(
|
|
|
1620
1724
|
# Find the operation expressions for given assembly ID
|
|
1621
1725
|
# We already asserted that the ID is actually present
|
|
1622
1726
|
if id == assembly_id:
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1727
|
+
for chain_id in asym_id_expr.split(","):
|
|
1728
|
+
chain_ops[chain_id].extend(_parse_operation_expression(op_expr))
|
|
1729
|
+
|
|
1730
|
+
sub_assemblies = []
|
|
1731
|
+
for asym_id, op_list in chain_ops.items():
|
|
1732
|
+
sub_struct = structure[..., structure.label_asym_id == asym_id]
|
|
1733
|
+
sub_assembly = _apply_transformations(sub_struct, transformations, op_list)
|
|
1734
|
+
# Merge the chain's sub_assembly into the rest of the assembly
|
|
1735
|
+
sub_assemblies.append(sub_assembly)
|
|
1736
|
+
assembly = concatenate(sub_assemblies)
|
|
1737
|
+
|
|
1738
|
+
# Sort AtomArray or AtomArrayStack by 'sym_id'
|
|
1739
|
+
max_sym_id = assembly.sym_id.max()
|
|
1740
|
+
assembly = concatenate(
|
|
1741
|
+
[assembly[..., assembly.sym_id == sym_id] for sym_id in range(max_sym_id + 1)]
|
|
1742
|
+
)
|
|
1636
1743
|
|
|
1637
1744
|
# Remove 'label_asym_id', if it was not included in the original
|
|
1638
1745
|
# user-supplied 'extra_fields'
|
|
@@ -1655,11 +1762,7 @@ def _apply_transformations(structure, transformation_dict, operations):
|
|
|
1655
1762
|
# Execute for each transformation step
|
|
1656
1763
|
# in the operation expression
|
|
1657
1764
|
for op_step in operation:
|
|
1658
|
-
|
|
1659
|
-
# Rotate
|
|
1660
|
-
coord = matrix_rotate(coord, rotation_matrix)
|
|
1661
|
-
# Translate
|
|
1662
|
-
coord += translation_vector
|
|
1765
|
+
coord = transformation_dict[op_step].apply(coord)
|
|
1663
1766
|
assembly_coord[i] = coord
|
|
1664
1767
|
|
|
1665
1768
|
assembly = repeat(structure, assembly_coord)
|
|
@@ -1671,8 +1774,7 @@ def _apply_transformations(structure, transformation_dict, operations):
|
|
|
1671
1774
|
|
|
1672
1775
|
def _get_transformations(struct_oper):
|
|
1673
1776
|
"""
|
|
1674
|
-
Get transformation
|
|
1675
|
-
translation for each operation ID in ``pdbx_struct_oper_list``.
|
|
1777
|
+
Get affine transformation for each operation ID in ``pdbx_struct_oper_list``.
|
|
1676
1778
|
"""
|
|
1677
1779
|
transformation_dict = {}
|
|
1678
1780
|
for index, id in enumerate(struct_oper["id"].as_array(str)):
|
|
@@ -1688,7 +1790,9 @@ def _get_transformations(struct_oper):
|
|
|
1688
1790
|
translation_vector = np.array(
|
|
1689
1791
|
[struct_oper[f"vector[{i}]"].as_array(float)[index] for i in (1, 2, 3)]
|
|
1690
1792
|
)
|
|
1691
|
-
transformation_dict[id] = (
|
|
1793
|
+
transformation_dict[id] = AffineTransformation(
|
|
1794
|
+
np.zeros(3), rotation_matrix, translation_vector
|
|
1795
|
+
)
|
|
1692
1796
|
return transformation_dict
|
|
1693
1797
|
|
|
1694
1798
|
|
|
@@ -1742,4 +1846,252 @@ def _convert_string_to_sequence(string, stype):
|
|
|
1742
1846
|
elif stype in _other_type_list:
|
|
1743
1847
|
return None
|
|
1744
1848
|
else:
|
|
1745
|
-
raise InvalidFileError("mmCIF _entity_poly.type unsupported
|
|
1849
|
+
raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
|
|
1850
|
+
|
|
1851
|
+
|
|
1852
|
+
def get_unit_cell(
|
|
1853
|
+
pdbx_file,
|
|
1854
|
+
center=True,
|
|
1855
|
+
model=None,
|
|
1856
|
+
data_block=None,
|
|
1857
|
+
altloc="first",
|
|
1858
|
+
extra_fields=None,
|
|
1859
|
+
use_author_fields=True,
|
|
1860
|
+
include_bonds=False,
|
|
1861
|
+
):
|
|
1862
|
+
"""
|
|
1863
|
+
Build a structure model containing all symmetric copies of the structure within a
|
|
1864
|
+
single unit cell.
|
|
1865
|
+
|
|
1866
|
+
This function receives the data from the ``symmetry`` and ``atom_site`` categories
|
|
1867
|
+
in the file.
|
|
1868
|
+
Consequently, these categories must be present in the file.
|
|
1869
|
+
|
|
1870
|
+
Parameters
|
|
1871
|
+
----------
|
|
1872
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
1873
|
+
The file object.
|
|
1874
|
+
center : bool, optional
|
|
1875
|
+
If set to true, each symmetric copy will be moved inside the unit cell
|
|
1876
|
+
dimensions, if its centroid is outside.
|
|
1877
|
+
If set to false, the copies are created using the raw space group
|
|
1878
|
+
transformations, which may put them one unit cell length further away.
|
|
1879
|
+
model : int, optional
|
|
1880
|
+
If this parameter is given, the function will return an
|
|
1881
|
+
:class:`AtomArray` from the atoms corresponding to the given
|
|
1882
|
+
model number (starting at 1).
|
|
1883
|
+
Negative values are used to index models starting from the last
|
|
1884
|
+
model instead of the first model.
|
|
1885
|
+
If this parameter is omitted, an :class:`AtomArrayStack`
|
|
1886
|
+
containing all models will be returned, even if the structure
|
|
1887
|
+
contains only one model.
|
|
1888
|
+
data_block : str, optional
|
|
1889
|
+
The name of the data block.
|
|
1890
|
+
Default is the first (and most times only) data block of the
|
|
1891
|
+
file.
|
|
1892
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
1893
|
+
this parameter is ignored.
|
|
1894
|
+
altloc : {'first', 'occupancy', 'all'}
|
|
1895
|
+
This parameter defines how *altloc* IDs are handled:
|
|
1896
|
+
- ``'first'`` - Use atoms that have the first *altloc* ID
|
|
1897
|
+
appearing in a residue.
|
|
1898
|
+
- ``'occupancy'`` - Use atoms that have the *altloc* ID
|
|
1899
|
+
with the highest occupancy for a residue.
|
|
1900
|
+
- ``'all'`` - Use all atoms.
|
|
1901
|
+
Note that this leads to duplicate atoms.
|
|
1902
|
+
When this option is chosen, the ``altloc_id`` annotation
|
|
1903
|
+
array is added to the returned structure.
|
|
1904
|
+
extra_fields : list of str, optional
|
|
1905
|
+
The strings in the list are entry names, that are
|
|
1906
|
+
additionally added as annotation arrays.
|
|
1907
|
+
The annotation category name will be the same as the PDBx
|
|
1908
|
+
subcategory name.
|
|
1909
|
+
The array type is always `str`.
|
|
1910
|
+
An exception are the special field identifiers:
|
|
1911
|
+
``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
|
|
1912
|
+
These will convert the fitting subcategory into an
|
|
1913
|
+
annotation array with reasonable type.
|
|
1914
|
+
use_author_fields : bool, optional
|
|
1915
|
+
Some fields can be read from two alternative sources,
|
|
1916
|
+
for example both, ``label_seq_id`` and ``auth_seq_id`` describe
|
|
1917
|
+
the ID of the residue.
|
|
1918
|
+
While the ``label_xxx`` fields can be used as official pointers
|
|
1919
|
+
to other categories in the file, the ``auth_xxx``
|
|
1920
|
+
fields are set by the author(s) of the structure and are
|
|
1921
|
+
consistent with the corresponding values in PDB files.
|
|
1922
|
+
If `use_author_fields` is true, the annotation arrays will be
|
|
1923
|
+
read from the ``auth_xxx`` fields (if applicable),
|
|
1924
|
+
otherwise from the ``label_xxx`` fields.
|
|
1925
|
+
include_bonds : bool, optional
|
|
1926
|
+
If set to true, a :class:`BondList` will be created for the
|
|
1927
|
+
resulting :class:`AtomArray` containing the bond information
|
|
1928
|
+
from the file.
|
|
1929
|
+
Bonds, whose order could not be determined from the
|
|
1930
|
+
*Chemical Component Dictionary*
|
|
1931
|
+
(e.g. especially inter-residue bonds),
|
|
1932
|
+
have :attr:`BondType.ANY`, since the PDB format itself does
|
|
1933
|
+
not support bond orders.
|
|
1934
|
+
|
|
1935
|
+
Returns
|
|
1936
|
+
-------
|
|
1937
|
+
unit_cell : AtomArray or AtomArrayStack
|
|
1938
|
+
The structure representing the unit cell.
|
|
1939
|
+
The return type depends on the `model` parameter.
|
|
1940
|
+
Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
|
|
1941
|
+
unit in the unit cell.
|
|
1942
|
+
|
|
1943
|
+
Examples
|
|
1944
|
+
--------
|
|
1945
|
+
|
|
1946
|
+
>>> import os.path
|
|
1947
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
|
|
1948
|
+
>>> unit_cell = get_unit_cell(file, model=1)
|
|
1949
|
+
"""
|
|
1950
|
+
block = _get_block(pdbx_file, data_block)
|
|
1951
|
+
|
|
1952
|
+
try:
|
|
1953
|
+
space_group = block["symmetry"]["space_group_name_H-M"].as_item()
|
|
1954
|
+
except KeyError:
|
|
1955
|
+
raise InvalidFileError("File has no 'symmetry.space_group_name_H-M' field")
|
|
1956
|
+
transforms = space_group_transforms(space_group)
|
|
1957
|
+
|
|
1958
|
+
asym = get_structure(
|
|
1959
|
+
pdbx_file,
|
|
1960
|
+
model,
|
|
1961
|
+
data_block,
|
|
1962
|
+
altloc,
|
|
1963
|
+
extra_fields,
|
|
1964
|
+
use_author_fields,
|
|
1965
|
+
include_bonds,
|
|
1966
|
+
)
|
|
1967
|
+
|
|
1968
|
+
fractional_asym_coord = coord_to_fraction(asym.coord, asym.box)
|
|
1969
|
+
unit_cell_copies = []
|
|
1970
|
+
for transform in transforms:
|
|
1971
|
+
fractional_coord = transform.apply(fractional_asym_coord)
|
|
1972
|
+
if center:
|
|
1973
|
+
# If the centroid is outside the box, move the copy inside the box
|
|
1974
|
+
orig_centroid = centroid(fractional_coord)
|
|
1975
|
+
new_centroid = orig_centroid % 1
|
|
1976
|
+
fractional_coord += (new_centroid - orig_centroid)[..., np.newaxis, :]
|
|
1977
|
+
unit_cell_copies.append(fraction_to_coord(fractional_coord, asym.box))
|
|
1978
|
+
|
|
1979
|
+
unit_cell = repeat(asym, np.stack(unit_cell_copies, axis=0))
|
|
1980
|
+
unit_cell.set_annotation(
|
|
1981
|
+
"sym_id", np.repeat(np.arange(len(transforms)), asym.array_length())
|
|
1982
|
+
)
|
|
1983
|
+
return unit_cell
|
|
1984
|
+
|
|
1985
|
+
|
|
1986
|
+
def get_sse(pdbx_file, data_block=None, match_model=None):
    """
    Read the per-residue secondary structure from a PDBx file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
        The following categories are required:

        - ``entity_poly``
        - ``struct_conf`` (if alpha-helices are present)
        - ``struct_sheet_range`` (if beta-strands are present)
        - ``atom_site`` (if `match_model` is set)

    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    match_model : int, optional
        If a model number is given, only the secondary structure elements
        of residues that are resolved in that model are kept.
        In other words, elements of residues that would not appear in a
        corresponding :class:`AtomArray` from :func:`get_structure()` are
        removed.
        By default, all residues in the sequence are kept.

    Returns
    -------
    sse_dict : dict of str -> ndarray, dtype=str
        The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
        secondary structure of the respective chain.

        - ``"a"``: alpha-helix
        - ``"b"``: beta-strand
        - ``"c"``: coil or not an amino acid

        Each secondary structure element corresponds to the ``label_seq_id`` of the
        ``atom_site`` category.
        This means that the 0-th position of the array corresponds to the residue
        in ``atom_site`` with ``label_seq_id`` ``1``.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
    >>> sse = get_sse(file, match_model=1)
    >>> print(sse)
    {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
           'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
           'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
           'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
          dtype='<U1')}

    If only secondary structure elements for resolved residues are requested, the length
    of the returned array matches the number of peptide residues in the structure.

    >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
    >>> print(len(get_sse(file, match_model=1)["A"]))
    128
    >>> atoms = get_structure(file, model=1)
    >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
    >>> print(get_residue_count(atoms))
    128
    """
    block = _get_block(pdbx_file, data_block)

    # Every chain starts out as pure coil ("c"), one entry per sequence position
    sse_dict = {}
    for chain_id, sequence in get_sequence(block).items():
        sse_dict[chain_id] = np.repeat("c", len(sequence))

    # Overwrite the coil with the elements listed in the helix and strand categories
    element_categories = (("a", "struct_conf"), ("b", "struct_sheet_range"))
    for sse_symbol, category_name in element_categories:
        if category_name not in block:
            continue
        category = block[category_name]
        element_chains = category["beg_auth_asym_id"].as_array(str)
        first_res_ids = category["beg_label_seq_id"].as_array(int)
        last_res_ids = category["end_label_seq_id"].as_array(int)

        for chain, first, last in zip(element_chains, first_res_ids, last_res_ids):
            # The range is given as 1-based inclusive positions,
            # hence only the start needs to be shifted for the 0-based slice
            sse_dict[chain][first - 1 : last] = sse_symbol

    if match_model is None:
        return sse_dict

    # Restrict each chain to the residues that have atoms in the requested model
    model_atom_site = _filter_model(block["atom_site"], match_model)
    atom_chain_ids = model_atom_site["auth_asym_id"].as_array(str)
    atom_res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
    # Atoms with a masked ``label_seq_id`` are read as -1 and are dropped here
    has_res_id = atom_res_ids != -1
    atom_chain_ids = atom_chain_ids[has_res_id]
    atom_res_ids = atom_res_ids[has_res_id]

    resolved_sse_dict = {}
    for chain_id, sse in sse_dict.items():
        # Unique 1-based residue IDs of this chain -> 0-based array indices
        indices = np.unique(atom_res_ids[atom_chain_ids == chain_id]) - 1
        resolved_sse_dict[chain_id] = sse[indices]
    return resolved_sse_dict
|