biotite 1.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,2113 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.structure.io.pdbx"
|
|
6
|
+
__author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
|
|
7
|
+
__all__ = [
|
|
8
|
+
"get_sequence",
|
|
9
|
+
"get_model_count",
|
|
10
|
+
"get_structure",
|
|
11
|
+
"set_structure",
|
|
12
|
+
"get_component",
|
|
13
|
+
"set_component",
|
|
14
|
+
"list_assemblies",
|
|
15
|
+
"get_assembly",
|
|
16
|
+
"get_unit_cell",
|
|
17
|
+
"get_sse",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
import itertools
|
|
21
|
+
import warnings
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
import numpy as np
|
|
24
|
+
from biotite.file import InvalidFileError
|
|
25
|
+
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
|
|
26
|
+
from biotite.structure.atoms import (
|
|
27
|
+
AtomArray,
|
|
28
|
+
AtomArrayStack,
|
|
29
|
+
concatenate,
|
|
30
|
+
repeat,
|
|
31
|
+
)
|
|
32
|
+
from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
|
|
33
|
+
from biotite.structure.box import (
|
|
34
|
+
coord_to_fraction,
|
|
35
|
+
fraction_to_coord,
|
|
36
|
+
space_group_transforms,
|
|
37
|
+
unitcell_from_vectors,
|
|
38
|
+
vectors_from_unitcell,
|
|
39
|
+
)
|
|
40
|
+
from biotite.structure.error import BadStructureError
|
|
41
|
+
from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
|
|
42
|
+
from biotite.structure.filter import (
|
|
43
|
+
_canonical_nucleotide_list as canonical_nucleotide_list,
|
|
44
|
+
)
|
|
45
|
+
from biotite.structure.filter import (
|
|
46
|
+
filter_first_altloc,
|
|
47
|
+
filter_highest_occupancy_altloc,
|
|
48
|
+
)
|
|
49
|
+
from biotite.structure.geometry import centroid
|
|
50
|
+
from biotite.structure.io.pdbx.bcif import (
|
|
51
|
+
BinaryCIFBlock,
|
|
52
|
+
BinaryCIFColumn,
|
|
53
|
+
BinaryCIFFile,
|
|
54
|
+
)
|
|
55
|
+
from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
|
|
56
|
+
from biotite.structure.io.pdbx.component import MaskValue
|
|
57
|
+
from biotite.structure.io.pdbx.encoding import StringArrayEncoding
|
|
58
|
+
from biotite.structure.repair import create_continuous_res_ids
|
|
59
|
+
from biotite.structure.residues import (
|
|
60
|
+
get_residue_count,
|
|
61
|
+
get_residue_positions,
|
|
62
|
+
get_residue_starts_for,
|
|
63
|
+
)
|
|
64
|
+
from biotite.structure.transform import AffineTransformation
|
|
65
|
+
|
|
66
|
+
# Bond type identifiers used in the `struct_conn` category, mapped to the
# `BondType` they are represented with
PDBX_BOND_TYPE_ID_TO_TYPE = {
    # In theory a covalent inter-residue bond could have a higher bond order,
    # but in practice inter-residue bonds are always single bonds
    **dict.fromkeys(
        (
            "covale",
            "covale_base",
            "covale_phosphate",
            "covale_sugar",
            "disulf",
            "modres",
            "modres_link",
        ),
        BondType.SINGLE,
    ),
    "metalc": BondType.COORDINATION,
}
|
|
79
|
+
# Reverse direction: the `struct_conn` bond type identifier that is used for
# each `BondType`.
# Every bond type except the coordination bond is reported as generic
# covalent bond ("covale").
PDBX_BOND_TYPE_TO_TYPE_ID = {
    BondType.ANY: "covale",
    BondType.SINGLE: "covale",
    BondType.DOUBLE: "covale",
    BondType.TRIPLE: "covale",
    BondType.QUADRUPLE: "covale",
    BondType.AROMATIC_SINGLE: "covale",
    BondType.AROMATIC_DOUBLE: "covale",
    BondType.AROMATIC_TRIPLE: "covale",
    # `PDBX_BOND_TYPE_TO_ORDER` lists `BondType.AROMATIC` explicitly to avoid
    # a KeyError; keep this table in sync so that the same bond type can be
    # looked up here as well.
    # NOTE(review): presumably both tables are indexed with the same bond
    # types when `struct_conn` is written - confirm against the writer code.
    BondType.AROMATIC: "covale",
    BondType.COORDINATION: "metalc",
}
|
|
90
|
+
# Bond order string for each `BondType`.
# Aromatic variants share the order string of their non-aromatic counterpart.
PDBX_BOND_TYPE_TO_ORDER = {
    BondType.SINGLE: "sing",
    BondType.DOUBLE: "doub",
    BondType.TRIPLE: "trip",
    BondType.QUADRUPLE: "quad",
    BondType.AROMATIC_SINGLE: "sing",
    BondType.AROMATIC_DOUBLE: "doub",
    BondType.AROMATIC_TRIPLE: "trip",
    # The values for these bond types are masked later,
    # they are merely added here to avoid a KeyError
    **dict.fromkeys(
        (BondType.ANY, BondType.AROMATIC, BondType.COORDINATION),
        "",
    ),
}
|
|
103
|
+
# Map each (bond order, aromaticity flag) pair of 'chem_comp_bond'
# to the corresponding 'BondType'...
COMP_BOND_ORDER_TO_TYPE = {
    (order, aromatic_flag): bond_type
    for order, aromatic_flag, bond_type in (
        ("SING", "N", BondType.SINGLE),
        ("DOUB", "N", BondType.DOUBLE),
        ("TRIP", "N", BondType.TRIPLE),
        ("QUAD", "N", BondType.QUADRUPLE),
        ("SING", "Y", BondType.AROMATIC_SINGLE),
        ("DOUB", "Y", BondType.AROMATIC_DOUBLE),
        ("TRIP", "Y", BondType.AROMATIC_TRIPLE),
        ("AROM", "Y", BondType.AROMATIC),
    )
}
# ...and the inverse lookup from 'BondType' to the (order, flag) pair
COMP_BOND_TYPE_TO_ORDER = dict(
    zip(COMP_BOND_ORDER_TO_TYPE.values(), COMP_BOND_ORDER_TO_TYPE.keys())
)
|
|
118
|
+
# Canonical amino acid residue names followed by canonical nucleotide names
CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
# Switching point between two matching strategies:
# It was observed that when the number of rows in `atom_site` and
# `struct_conn` exceeds a certain threshold, a dictionary approach is less
# computation and memory intensive than the dense vectorized approach.
# https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
FIND_MATCHES_SWITCH_THRESHOLD = 4_000_000
|
|
125
|
+
|
|
126
|
+
# Polymer type names, grouped by the kind of sequence they describe

# Peptide polymers
_proteinseq_type_list = [
    "polypeptide(D)",
    "polypeptide(L)",
]

# Nucleic acid polymers
_nucleotideseq_type_list = [
    "polydeoxyribonucleotide",
    "polyribonucleotide",
    "polydeoxyribonucleotide/polyribonucleotide hybrid",
]

# Remaining polymer types that are neither of the above
_other_type_list = [
    "cyclic-pseudo-peptide",
    "other",
    "peptide nucleic acid",
    "polysaccharide(D)",
    "polysaccharide(L)",
]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _filter(category, index):
    """
    Build a new category that only contains the rows selected by `index`.

    Parameters
    ----------
    category : category object
        The category to be reduced.
        Its type is reused to construct the result; the column and data
        classes are obtained via ``subcomponent_class()``.
    index : object
        The index applied to the data array and, if present, to the mask
        array of each column (``array[index]``).

    Returns
    -------
    filtered : same type as `category`
        A new category with the same column names, where each column holds
        only the selected values.
    """
    category_cls = type(category)
    column_cls = category_cls.subcomponent_class()
    data_cls = column_cls.subcomponent_class()

    selected_columns = {}
    for name, col in category.items():
        selected_data = data_cls(col.data.array[index])
        # Columns without a mask stay unmasked
        selected_mask = (
            None if col.mask is None else data_cls(col.mask.array[index])
        )
        selected_columns[name] = column_cls(selected_data, selected_mask)
    return category_cls(selected_columns)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def get_sequence(pdbx_file, data_block=None):
    """
    Read the protein and nucleotide sequences stored in the
    ``entity_poly.pdbx_seq_one_letter_code_can`` entry.

    Supported polymer types (``_entity_poly.type``) are:
    ``'polypeptide(D)'``, ``'polypeptide(L)'``,
    ``'polydeoxyribonucleotide'``, ``'polyribonucleotide'`` and
    ``'polydeoxyribonucleotide/polyribonucleotide hybrid'``.
    Uracil is converted to Thymine.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    sequence_dict : Dictionary of Sequences
        Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
        (equivalent to ``atom_site.auth_asym_id``).
        Dictionary values are sequences.

    Notes
    -----
    The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the
    initial complete sequence.
    If the structure represents a truncated or spliced version of this
    initial sequence, it will include only a subset of the initial sequence.
    Use biotite.structure.get_residues to retrieve only the residues that
    are represented in the structure.
    """
    entity_poly = _get_block(pdbx_file, data_block)["entity_poly"]

    one_letter_codes = entity_poly["pdbx_seq_one_letter_code_can"].as_array(str)
    polymer_types = entity_poly["type"].as_array(str)
    # One entry per entity; entries that are `None` are skipped below
    sequences = list(
        map(_convert_string_to_sequence, one_letter_codes, polymer_types)
    )

    # An entity may be shared by multiple chains, given as comma-separated IDs
    chain_ids_per_entity = [
        joined_ids.split(",")
        for joined_ids in entity_poly["pdbx_strand_id"].as_array(str)
    ]

    sequence_dict = {}
    for sequence, chain_ids in zip(sequences, chain_ids_per_entity):
        if sequence is None:
            continue
        for chain_id in chain_ids:
            sequence_dict[chain_id] = sequence
    return sequence_dict
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def get_model_count(pdbx_file, data_block=None):
    """
    Count the models contained in a file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    model_count : int
        The number of models.
    """
    atom_site = _get_block(pdbx_file, data_block)["atom_site"]
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    # Each distinct model number in `atom_site` represents one model
    distinct_model_numbers = np.unique(model_numbers)
    return len(distinct_model_numbers)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def get_structure(
|
|
247
|
+
pdbx_file,
|
|
248
|
+
model=None,
|
|
249
|
+
data_block=None,
|
|
250
|
+
altloc="first",
|
|
251
|
+
extra_fields=None,
|
|
252
|
+
use_author_fields=True,
|
|
253
|
+
include_bonds=False,
|
|
254
|
+
):
|
|
255
|
+
"""
|
|
256
|
+
Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
|
|
257
|
+
``atom_site`` category in a file.
|
|
258
|
+
|
|
259
|
+
Parameters
|
|
260
|
+
----------
|
|
261
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
262
|
+
The file object.
|
|
263
|
+
model : int, optional
|
|
264
|
+
If this parameter is given, the function will return an
|
|
265
|
+
:class:`AtomArray` from the atoms corresponding to the given
|
|
266
|
+
model number (starting at 1).
|
|
267
|
+
Negative values are used to index models starting from the last
|
|
268
|
+
model instead of the first model.
|
|
269
|
+
If this parameter is omitted, an :class:`AtomArrayStack`
|
|
270
|
+
containing all models will be returned, even if the structure
|
|
271
|
+
contains only one model.
|
|
272
|
+
data_block : str, optional
|
|
273
|
+
The name of the data block.
|
|
274
|
+
Default is the first (and most times only) data block of the
|
|
275
|
+
file.
|
|
276
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
277
|
+
this parameter is ignored.
|
|
278
|
+
altloc : {'first', 'occupancy', 'all'}
|
|
279
|
+
This parameter defines how *altloc* IDs are handled:
|
|
280
|
+
- ``'first'`` - Use atoms that have the first *altloc* ID
|
|
281
|
+
appearing in a residue.
|
|
282
|
+
- ``'occupancy'`` - Use atoms that have the *altloc* ID
|
|
283
|
+
with the highest occupancy for a residue.
|
|
284
|
+
- ``'all'`` - Use all atoms.
|
|
285
|
+
Note that this leads to duplicate atoms.
|
|
286
|
+
When this option is chosen, the ``altloc_id`` annotation
|
|
287
|
+
array is added to the returned structure.
|
|
288
|
+
extra_fields : list of str, optional
|
|
289
|
+
The strings in the list are entry names, that are
|
|
290
|
+
additionally added as annotation arrays.
|
|
291
|
+
The annotation category name will be the same as the PDBx
|
|
292
|
+
subcategory name.
|
|
293
|
+
The array type is always `str`.
|
|
294
|
+
An exception are the special field identifiers:
|
|
295
|
+
``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
|
|
296
|
+
These will convert the fitting subcategory into an
|
|
297
|
+
annotation array with reasonable type.
|
|
298
|
+
use_author_fields : bool, optional
|
|
299
|
+
Some fields can be read from two alternative sources,
|
|
300
|
+
for example both, ``label_seq_id`` and ``auth_seq_id`` describe
|
|
301
|
+
the ID of the residue.
|
|
302
|
+
While the ``label_xxx`` fields can be used as official pointers
|
|
303
|
+
to other categories in the file, the ``auth_xxx``
|
|
304
|
+
fields are set by the author(s) of the structure and are
|
|
305
|
+
consistent with the corresponding values in PDB files.
|
|
306
|
+
If `use_author_fields` is true, the annotation arrays will be
|
|
307
|
+
read from the ``auth_xxx`` fields (if applicable),
|
|
308
|
+
otherwise from the ``label_xxx`` fields.
|
|
309
|
+
If the requested field is not available, the respective other
|
|
310
|
+
field is taken as fallback.
|
|
311
|
+
include_bonds : bool, optional
|
|
312
|
+
If set to true, a :class:`BondList` will be created for the
|
|
313
|
+
resulting :class:`AtomArray` containing the bond information
|
|
314
|
+
from the file.
|
|
315
|
+
Inter-residue bonds will be read from the ``struct_conn``
|
|
316
|
+
category.
|
|
317
|
+
Intra-residue bonds will be read from the ``chem_comp_bond``, if
|
|
318
|
+
available, otherwise they will be derived from the Chemical
|
|
319
|
+
Component Dictionary.
|
|
320
|
+
|
|
321
|
+
Returns
|
|
322
|
+
-------
|
|
323
|
+
array : AtomArray or AtomArrayStack
|
|
324
|
+
The return type depends on the `model` parameter.
|
|
325
|
+
|
|
326
|
+
Examples
|
|
327
|
+
--------
|
|
328
|
+
|
|
329
|
+
>>> import os.path
|
|
330
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
|
|
331
|
+
>>> arr = get_structure(file, model=1)
|
|
332
|
+
>>> print(len(arr))
|
|
333
|
+
304
|
|
334
|
+
"""
|
|
335
|
+
block = _get_block(pdbx_file, data_block)
|
|
336
|
+
|
|
337
|
+
extra_fields = set() if extra_fields is None else set(extra_fields)
|
|
338
|
+
|
|
339
|
+
atom_site = block.get("atom_site")
|
|
340
|
+
if atom_site is None:
|
|
341
|
+
raise InvalidFileError("Missing 'atom_site' category in file")
|
|
342
|
+
|
|
343
|
+
models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
|
|
344
|
+
model_count = len(np.unique(models))
|
|
345
|
+
atom_count = len(models)
|
|
346
|
+
|
|
347
|
+
if model is None:
|
|
348
|
+
# For a stack, the annotations are derived from the first model
|
|
349
|
+
model_atom_site = _filter_model(atom_site, 1)
|
|
350
|
+
# Any field of the category would work here to get the length
|
|
351
|
+
model_length = model_atom_site.row_count
|
|
352
|
+
atoms = AtomArrayStack(model_count, model_length)
|
|
353
|
+
|
|
354
|
+
# Check if each model has the same amount of atoms
|
|
355
|
+
# If not, raise exception
|
|
356
|
+
if model_length * model_count != atom_count:
|
|
357
|
+
raise InvalidFileError(
|
|
358
|
+
"The models in the file have unequal "
|
|
359
|
+
"amount of atoms, give an explicit model "
|
|
360
|
+
"instead"
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
atoms.coord[:, :, 0] = (
|
|
364
|
+
atom_site["Cartn_x"]
|
|
365
|
+
.as_array(np.float32)
|
|
366
|
+
.reshape((model_count, model_length))
|
|
367
|
+
)
|
|
368
|
+
atoms.coord[:, :, 1] = (
|
|
369
|
+
atom_site["Cartn_y"]
|
|
370
|
+
.as_array(np.float32)
|
|
371
|
+
.reshape((model_count, model_length))
|
|
372
|
+
)
|
|
373
|
+
atoms.coord[:, :, 2] = (
|
|
374
|
+
atom_site["Cartn_z"]
|
|
375
|
+
.as_array(np.float32)
|
|
376
|
+
.reshape((model_count, model_length))
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
box = _get_box(block)
|
|
380
|
+
if box is not None:
|
|
381
|
+
# Duplicate same box for each model
|
|
382
|
+
atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
|
|
383
|
+
|
|
384
|
+
else:
|
|
385
|
+
if model == 0:
|
|
386
|
+
raise ValueError("The model index must not be 0")
|
|
387
|
+
# Negative models mean model indexing starting from last model
|
|
388
|
+
model = model_count + model + 1 if model < 0 else model
|
|
389
|
+
if model > model_count:
|
|
390
|
+
raise ValueError(
|
|
391
|
+
f"The file has {model_count} models, "
|
|
392
|
+
f"the given model {model} does not exist"
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
model_atom_site = _filter_model(atom_site, model)
|
|
396
|
+
# Any field of the category would work here to get the length
|
|
397
|
+
model_length = model_atom_site.row_count
|
|
398
|
+
atoms = AtomArray(model_length)
|
|
399
|
+
|
|
400
|
+
atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
|
|
401
|
+
atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
|
|
402
|
+
atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
|
|
403
|
+
|
|
404
|
+
atoms.box = _get_box(block)
|
|
405
|
+
|
|
406
|
+
# The below part is the same for both, AtomArray and AtomArrayStack
|
|
407
|
+
_fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
|
|
408
|
+
|
|
409
|
+
atoms, altloc_filtered_atom_site = _filter_altloc(atoms, model_atom_site, altloc)
|
|
410
|
+
|
|
411
|
+
if include_bonds:
|
|
412
|
+
if altloc == "all":
|
|
413
|
+
raise ValueError(
|
|
414
|
+
"Bond computation is not supported with `altloc='all', consider using "
|
|
415
|
+
"'connect_via_residue_names()' afterwards"
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
if "chem_comp_bond" in block:
|
|
419
|
+
try:
|
|
420
|
+
custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
|
|
421
|
+
except KeyError:
|
|
422
|
+
warnings.warn(
|
|
423
|
+
"The 'chem_comp_bond' category has missing columns, "
|
|
424
|
+
"falling back to using Chemical Component Dictionary",
|
|
425
|
+
UserWarning,
|
|
426
|
+
)
|
|
427
|
+
custom_bond_dict = None
|
|
428
|
+
bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict)
|
|
429
|
+
else:
|
|
430
|
+
bonds = connect_via_residue_names(atoms)
|
|
431
|
+
if "struct_conn" in block:
|
|
432
|
+
bonds = bonds.merge(
|
|
433
|
+
_parse_inter_residue_bonds(
|
|
434
|
+
altloc_filtered_atom_site,
|
|
435
|
+
block["struct_conn"],
|
|
436
|
+
atom_count=atoms.array_length(),
|
|
437
|
+
)
|
|
438
|
+
)
|
|
439
|
+
atoms.bonds = bonds
|
|
440
|
+
|
|
441
|
+
return atoms
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _get_block(pdbx_component, block_name):
    """
    Resolve the data block that should be read.

    If `pdbx_component` already is a block, it is returned unchanged and
    `block_name` is ignored.
    Otherwise the block named `block_name` is looked up in the component,
    or its ``block`` attribute is used if `block_name` is ``None``.
    """
    if isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
        # Already a block -> nothing to look up
        return pdbx_component
    if block_name is None:
        return pdbx_component.block
    return pdbx_component[block_name]
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def _get_or_fallback(category, key, fallback_key):
|
|
456
|
+
"""
|
|
457
|
+
Return column related to key in category if it exists,
|
|
458
|
+
otherwise try to get the column related to fallback key.
|
|
459
|
+
"""
|
|
460
|
+
if key not in category:
|
|
461
|
+
warnings.warn(
|
|
462
|
+
f"Attribute '{key}' not found within 'atom_site' category. "
|
|
463
|
+
f"The fallback attribute '{fallback_key}' will be used instead",
|
|
464
|
+
UserWarning,
|
|
465
|
+
)
|
|
466
|
+
try:
|
|
467
|
+
return category[fallback_key]
|
|
468
|
+
except KeyError as key_exc:
|
|
469
|
+
raise InvalidFileError(
|
|
470
|
+
f"Fallback attribute '{fallback_key}' not found within "
|
|
471
|
+
"'atom_site' category"
|
|
472
|
+
) from key_exc
|
|
473
|
+
return category[key]
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
    """
    Write the annotations derived from ``atom_site`` into `array`.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure that receives the annotation arrays.
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category restricted to a single model.
    extra_fields : list of str
        Names of additional annotations to add.
        The special names ``'atom_id'``, ``'b_factor'``, ``'occupancy'``
        and ``'charge'`` are removed from this collection in-place once
        they have been handled; all remaining names are read verbatim
        from ``atom_site`` as `str` arrays.
    use_author_fields : bool
        If true, the ``auth_xxx`` columns are preferred over the
        ``label_xxx`` columns, otherwise vice versa.
    """
    if use_author_fields:
        prefix, alt_prefix = "auth", "label"
    else:
        prefix, alt_prefix = "label", "auth"

    def preferred_column(suffix):
        # Preferred variant of a column, with the other variant as fallback
        return _get_or_fallback(
            atom_site, f"{prefix}_{suffix}", f"{alt_prefix}_{suffix}"
        )

    array.set_annotation("chain_id", preferred_column("asym_id").as_array(str))
    array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
    array.set_annotation("res_name", preferred_column("comp_id").as_array(str))
    is_hetero = atom_site["group_PDB"].as_array(str) == "HETATM"
    array.set_annotation("hetero", is_hetero)
    array.set_annotation("atom_name", preferred_column("atom_id").as_array(str))
    array.set_annotation("element", atom_site["type_symbol"].as_array(str))

    # `label_seq_id` is masked (`.`) for all hetero residues, so subsequent
    # hetero residues cannot be told apart by it
    # (https://github.com/biotite-dev/biotite/issues/553)
    res_id = preferred_column("seq_id").as_array(int, -1)
    if use_author_fields or "auth_seq_id" not in atom_site:
        array.set_annotation("res_id", res_id)
    else:
        # Temporarily annotate `auth_seq_id`, so that
        # `create_continuous_res_ids()` finds the residue starts,
        # then only fill the masked `label_seq_id` positions with its result
        array.set_annotation("res_id", atom_site["auth_seq_id"].as_array(int, -1))
        continuous_res_ids = create_continuous_res_ids(array)
        array.set_annotation(
            "res_id", np.where(res_id == -1, continuous_res_ids, res_id)
        )

    if "atom_id" in extra_fields:
        if "id" not in atom_site:
            warnings.warn(
                "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
                UserWarning,
            )
            atom_ids = np.arange(array.array_length())
        else:
            atom_ids = atom_site["id"].as_array(int)
        array.set_annotation("atom_id", atom_ids)
        extra_fields.remove("atom_id")

    if "b_factor" in extra_fields:
        if "B_iso_or_equiv" not in atom_site:
            warnings.warn(
                "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
                UserWarning,
            )
            b_factors = np.full(array.array_length(), np.nan)
        else:
            b_factors = atom_site["B_iso_or_equiv"].as_array(float)
        array.set_annotation("b_factor", b_factors)
        extra_fields.remove("b_factor")

    if "occupancy" in extra_fields:
        if "occupancy" not in atom_site:
            warnings.warn(
                "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
                UserWarning,
            )
            occupancies = np.ones(array.array_length(), dtype=float)
        else:
            occupancies = atom_site["occupancy"].as_array(float)
        array.set_annotation("occupancy", occupancies)
        extra_fields.remove("occupancy")

    if "charge" in extra_fields:
        if "pdbx_formal_charge" not in atom_site:
            warnings.warn(
                "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
                UserWarning,
            )
            charges = np.zeros(array.array_length(), dtype=int)
        else:
            # Masked values are set to 0
            charges = atom_site["pdbx_formal_charge"].as_array(int, 0)
        array.set_annotation("charge", charges)
        extra_fields.remove("charge")

    # Everything left is a plain column name -> copy it as string annotation
    for field in extra_fields:
        array.set_annotation(field, atom_site[field].as_array(str))
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _parse_intra_residue_bonds(chem_comp_bond):
    """
    Convert the ``chem_comp_bond`` category into a `custom_bond_dict`
    as accepted by :func:`connect_via_residue_names()`.

    The result maps each residue name to a dictionary, which in turn maps
    an ``(atom_name_1, atom_name_2)`` tuple to the bond type.
    Combinations of bond order and aromatic flag that are not present in
    ``COMP_BOND_ORDER_TO_TYPE`` become ``BondType.ANY``.
    """
    comp_ids = chem_comp_bond["comp_id"].as_array(str)
    first_atom_names = chem_comp_bond["atom_id_1"].as_array(str)
    second_atom_names = chem_comp_bond["atom_id_2"].as_array(str)
    orders = chem_comp_bond["value_order"].as_array(str)
    aromatic_flags = chem_comp_bond["pdbx_aromatic_flag"].as_array(str)

    bonds_by_residue = {}
    for res_name, atom_1, atom_2, order, aromatic_flag in zip(
        comp_ids, first_atom_names, second_atom_names, orders, aromatic_flags
    ):
        residue_bonds = bonds_by_residue.setdefault(res_name, {})
        # `.item()` turns the NumPy strings into built-in `str` objects
        residue_bonds[atom_1.item(), atom_2.item()] = COMP_BOND_ORDER_TO_TYPE.get(
            (order.upper(), aromatic_flag), BondType.ANY
        )
    return bonds_by_residue
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
    """
    Create inter-residue bonds by parsing the ``struct_conn`` category.
    The atom indices of each bond are found by matching the bond labels
    to the ``atom_site`` category.
    If atom_count is None, it will be inferred from the ``atom_site`` category.

    Parameters
    ----------
    atom_site
        The ``atom_site`` category the bond partners are looked up in.
    struct_conn
        The ``struct_conn`` category describing the connections.
    atom_count : int, optional
        The atom count given to the created :class:`BondList`.
        By default ``atom_site.row_count`` is used.

    Returns
    -------
    bonds : BondList
        The bonds whose both partners could be found in `atom_site`.
    """
    # Identity symmetry operation
    IDENTITY = "1_555"
    # Columns in 'atom_site' that should be matched by 'struct_conn'
    COLUMNS = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "label_alt_id",
        "auth_asym_id",
        "auth_comp_id",
        "auth_seq_id",
        "pdbx_PDB_ins_code",
    ]

    # Only keep connections whose type has a known bond type ...
    covale_mask = np.isin(
        struct_conn["conn_type_id"].as_array(str),
        list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
    )
    # ... and where both partners use the identity symmetry operation
    # (masked symmetry values are treated as identity)
    if "ptnr1_symmetry" in struct_conn:
        covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
    if "ptnr2_symmetry" in struct_conn:
        covale_mask &= struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY

    # One index array per bond partner
    atom_indices = [None] * 2
    for i in range(2):
        reference_arrays = []
        query_arrays = []
        for col_name in COLUMNS:
            struct_conn_col_name = _get_struct_conn_col_name(col_name, i + 1)
            # Only columns present in both categories can be compared
            if col_name not in atom_site or struct_conn_col_name not in struct_conn:
                continue
            # Ensure both arrays have the same dtype to allow comparison
            reference = atom_site[col_name].as_array()
            dtype = reference.dtype
            query = struct_conn[struct_conn_col_name].as_array(dtype)
            if np.issubdtype(reference.dtype, str):
                # The mask value is not necessarily consistent
                # between query and reference
                # -> make it consistent
                reference[reference == "?"] = "."
                query[query == "?"] = "."
            reference_arrays.append(reference)
            query_arrays.append(query[covale_mask])
        # Match the combination of 'label_asym_id', 'label_comp_id', etc.
        # in 'atom_site' and 'struct_conn'
        atom_indices[i] = _find_matches(query_arrays, reference_arrays)
    atoms_indices_1 = atom_indices[0]
    atoms_indices_2 = atom_indices[1]

    # Some bonds in 'struct_conn' may not be found in 'atom_site'
    # This is okay,
    # as 'atom_site' might already be reduced to a single model
    mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
    atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
    atoms_indices_2 = atoms_indices_2[mapping_exists_mask]

    bond_type_id = struct_conn["conn_type_id"].as_array()
    # Consecutively apply the same masks as applied to the atom indices
    # Logical combination does not work here,
    # as the second mask was created based on already filtered data
    bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
    # The type ID is always present in the dictionary,
    # as it was used to filter the applicable bonds
    bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]

    # Each row is one bond: (first atom index, second atom index, bond type)
    return BondList(
        atom_count if atom_count is not None else atom_site.row_count,
        np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
    )
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
def _find_matches(query_arrays, reference_arrays):
    """
    For each row of the `query_arrays` find the index of the row in the
    `reference_arrays`, where the values of all columns are equal.
    Queries without a match get the index -1.

    The number of query rows times the number of reference rows decides
    which implementation is used: up to ``FIND_MATCHES_SWITCH_THRESHOLD``
    the dense array comparison, above that the dictionary lookup.
    """
    comparison_count = query_arrays[0].shape[0] * reference_arrays[0].shape[0]
    if comparison_count > FIND_MATCHES_SWITCH_THRESHOLD:
        return _find_matches_by_dict(query_arrays, reference_arrays)
    return _find_matches_by_dense_array(query_arrays, reference_arrays)
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def _find_matches_by_dense_array(query_arrays, reference_arrays):
|
|
705
|
+
match_masks_for_all_columns = np.stack(
|
|
706
|
+
[
|
|
707
|
+
query[:, np.newaxis] == reference[np.newaxis, :]
|
|
708
|
+
for query, reference in zip(query_arrays, reference_arrays)
|
|
709
|
+
],
|
|
710
|
+
axis=-1,
|
|
711
|
+
)
|
|
712
|
+
match_masks = np.all(match_masks_for_all_columns, axis=-1)
|
|
713
|
+
query_matches, reference_matches = np.where(match_masks)
|
|
714
|
+
|
|
715
|
+
# Duplicate matches indicate that an atom from the query cannot
|
|
716
|
+
# be uniquely matched to an atom in the reference
|
|
717
|
+
unique_query_matches, counts = np.unique(query_matches, return_counts=True)
|
|
718
|
+
if np.any(counts > 1):
|
|
719
|
+
ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
|
|
720
|
+
raise InvalidFileError(
|
|
721
|
+
f"The covalent bond in the 'struct_conn' category at index "
|
|
722
|
+
f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
|
|
723
|
+
f"the 'atom_site' category"
|
|
724
|
+
)
|
|
725
|
+
|
|
726
|
+
# -1 indicates that no match was found in the reference
|
|
727
|
+
match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
|
|
728
|
+
match_indices[query_matches] = reference_matches
|
|
729
|
+
return match_indices
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
def _find_matches_by_dict(query_arrays, reference_arrays):
|
|
733
|
+
# Convert reference arrays to a dictionary for O(1) lookups
|
|
734
|
+
reference_dict = {}
|
|
735
|
+
ambiguous_keys = set()
|
|
736
|
+
for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
|
|
737
|
+
ref_key = tuple(ref_row)
|
|
738
|
+
if ref_key in reference_dict:
|
|
739
|
+
ambiguous_keys.add(ref_key)
|
|
740
|
+
continue
|
|
741
|
+
reference_dict[ref_key] = ref_idx
|
|
742
|
+
|
|
743
|
+
match_indices = []
|
|
744
|
+
for query_idx, query_row in enumerate(zip(*query_arrays)):
|
|
745
|
+
query_key = tuple(query_row)
|
|
746
|
+
occurrence = reference_dict.get(query_key)
|
|
747
|
+
|
|
748
|
+
if occurrence is None:
|
|
749
|
+
# -1 indicates that no match was found in the reference
|
|
750
|
+
match_indices.append(-1)
|
|
751
|
+
elif query_key in ambiguous_keys:
|
|
752
|
+
# The query cannot be uniquely matched to an atom in the reference
|
|
753
|
+
raise InvalidFileError(
|
|
754
|
+
f"The covalent bond in the 'struct_conn' category at index "
|
|
755
|
+
f"{query_idx} cannot be unambiguously assigned to atoms in "
|
|
756
|
+
f"the 'atom_site' category"
|
|
757
|
+
)
|
|
758
|
+
else:
|
|
759
|
+
match_indices.append(occurrence)
|
|
760
|
+
|
|
761
|
+
return np.array(match_indices)
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def _get_struct_conn_col_name(col_name, partner):
|
|
765
|
+
"""
|
|
766
|
+
For a column name in ``atom_site`` get the corresponding column name
|
|
767
|
+
in ``struct_conn``.
|
|
768
|
+
"""
|
|
769
|
+
if col_name == "label_alt_id":
|
|
770
|
+
return f"pdbx_ptnr{partner}_label_alt_id"
|
|
771
|
+
elif col_name.startswith("pdbx_"):
|
|
772
|
+
# Move 'pdbx_' to front
|
|
773
|
+
return f"pdbx_ptnr{partner}_{col_name[5:]}"
|
|
774
|
+
else:
|
|
775
|
+
return f"ptnr{partner}_{col_name}"
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
def _filter_altloc(array, atom_site, altloc):
|
|
779
|
+
"""
|
|
780
|
+
Filter the given :class:`AtomArray` and ``atom_site`` category to the rows
|
|
781
|
+
specified by the given *altloc* identifier.
|
|
782
|
+
"""
|
|
783
|
+
altloc_ids = atom_site.get("label_alt_id")
|
|
784
|
+
occupancy = atom_site.get("occupancy")
|
|
785
|
+
|
|
786
|
+
if altloc == "all":
|
|
787
|
+
array.set_annotation("altloc_id", altloc_ids.as_array(str))
|
|
788
|
+
return array, atom_site
|
|
789
|
+
elif altloc_ids is None or (
|
|
790
|
+
altloc_ids.mask is not None
|
|
791
|
+
and (altloc_ids.mask.array != MaskValue.PRESENT).all()
|
|
792
|
+
):
|
|
793
|
+
# No altlocs in atom_site category
|
|
794
|
+
return array, atom_site
|
|
795
|
+
elif altloc == "occupancy" and occupancy is not None:
|
|
796
|
+
mask = filter_highest_occupancy_altloc(
|
|
797
|
+
array, altloc_ids.as_array(str), occupancy.as_array(float)
|
|
798
|
+
)
|
|
799
|
+
return array[..., mask], _filter(atom_site, mask)
|
|
800
|
+
# 'first' is also fallback if file has no occupancy information
|
|
801
|
+
elif altloc == "first":
|
|
802
|
+
mask = filter_first_altloc(array, altloc_ids.as_array(str))
|
|
803
|
+
return array[..., mask], _filter(atom_site, mask)
|
|
804
|
+
else:
|
|
805
|
+
raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
def _filter_model(atom_site, model):
    """
    Restrict the ``atom_site`` category to the rows of a single model.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category containing all models.
    model : int
        The model to be selected, counted from 1 in order of appearance.

    Returns
    -------
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category containing only the selected model.
    """
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    # Row index where each model number appears first, in file order
    first_rows = np.sort(np.unique(model_numbers, return_index=True)[1])
    # The total row count acts as exclusive stop of the last model
    boundaries = np.append(first_rows, [atom_site.row_count])
    # Model numbers start at 1, whereas the boundaries are indexed from 0
    start = boundaries[model - 1]
    stop = boundaries[model]
    return _filter(atom_site, slice(start, stop))
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
def _get_box(block):
|
|
837
|
+
cell = block.get("cell")
|
|
838
|
+
if cell is None:
|
|
839
|
+
return None
|
|
840
|
+
try:
|
|
841
|
+
len_a, len_b, len_c = [
|
|
842
|
+
float(cell[length].as_item())
|
|
843
|
+
for length in ["length_a", "length_b", "length_c"]
|
|
844
|
+
]
|
|
845
|
+
alpha, beta, gamma = [
|
|
846
|
+
np.deg2rad(float(cell[angle].as_item()))
|
|
847
|
+
for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
|
|
848
|
+
]
|
|
849
|
+
except ValueError:
|
|
850
|
+
# 'cell_dict' has no proper unit cell values, e.g. '?'
|
|
851
|
+
return None
|
|
852
|
+
return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
def set_structure(
    pdbx_file,
    array,
    data_block=None,
    include_bonds=False,
    extra_fields=None,
):
    """
    Set the ``atom_site`` category with atom information from an
    :class:`AtomArray` or :class:`AtomArrayStack`.

    This will save the coordinates, the mandatory annotation categories
    and the optional annotation categories
    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of continuous
    numbering.
    Furthermore, inter-residue bonds will be written into the
    ``struct_conn`` category.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray or AtomArrayStack
        The structure to be written. If a stack is given, each array in
        the stack will be in a separate model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
        If the file is empty, a new data block will be created.
    include_bonds : bool, optional
        DEPRECATED: Has no effect anymore.
    extra_fields : iterable of str, optional
        Names of additional annotation arrays of `array`
        that should be written into the ``atom_site`` category.
        By default no additional fields are written.

    Raises
    ------
    ValueError
        If a name in `extra_fields` clashes with a standard annotation
        or an ``atom_site`` column that is already written.

    Notes
    -----
    In some cases, the written inter-residue bonds cannot be read again
    due to ambiguity to which atoms the bond refers.
    This is the case, when two equal residues in the same chain have
    the same (or a masked) `res_id`.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile()
    >>> set_structure(file, atom_array)
    >>> file.write(os.path.join(path_to_directory, "structure.cif"))
    """
    if include_bonds:
        warnings.warn(
            "`include_bonds` parameter is deprecated, "
            "intra-residue are always written, if available",
            DeprecationWarning,
        )

    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()
    Column = Category.subcomponent_class()

    annot_categories = array.get_annotation_categories()

    # Fill PDBx columns from information
    # in structures' attribute arrays as good as possible
    atom_site = Category()
    atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM")
    atom_site["type_symbol"] = np.copy(array.element)
    atom_site["label_atom_id"] = np.copy(array.atom_name)
    atom_site["label_alt_id"] = Column(
        # AtomArrays do not store altloc atoms
        np.full(array.array_length(), "."),
        np.full(array.array_length(), MaskValue.INAPPLICABLE),
    )
    atom_site["label_comp_id"] = np.copy(array.res_name)
    atom_site["label_asym_id"] = np.copy(array.chain_id)
    atom_site["label_entity_id"] = (
        np.copy(array.label_entity_id)
        if "label_entity_id" in annot_categories
        else _determine_entity_id(array.chain_id)
    )
    atom_site["label_seq_id"] = np.copy(array.res_id)
    atom_site["pdbx_PDB_ins_code"] = Column(
        np.copy(array.ins_code),
        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT),
    )
    # The structure has only one set of identifiers
    # -> the 'auth_xxx' columns mirror the 'label_xxx' columns
    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
    atom_site["auth_atom_id"] = atom_site["label_atom_id"]

    if "atom_id" in annot_categories:
        atom_site["id"] = np.copy(array.atom_id)
    if "b_factor" in annot_categories:
        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
    if "occupancy" in annot_categories:
        atom_site["occupancy"] = np.copy(array.occupancy)
    if "charge" in annot_categories:
        # A charge of 0 is written as missing value
        atom_site["pdbx_formal_charge"] = Column(
            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
        )

    # Handle all remaining custom fields
    # Materialize once: this accepts any iterable (including NumPy arrays)
    # and avoids a shared mutable default argument
    extra_fields = [] if extra_fields is None else list(extra_fields)
    if extra_fields:
        # ... check to avoid clashes with standard annotations
        _standard_annotations = [
            "hetero",
            "element",
            "atom_name",
            "res_name",
            "chain_id",
            "res_id",
            "ins_code",
            "atom_id",
            "b_factor",
            "occupancy",
            "charge",
        ]
        # The reserved names are fixed before the first extra field is
        # written, i.e. extra fields do not reserve names for each other
        _reserved_annotation_names = set(atom_site.keys()) | set(_standard_annotations)

        for annot in extra_fields:
            if annot in _reserved_annotation_names:
                raise ValueError(
                    f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
                    "Please choose another name."
                )
            atom_site[annot] = np.copy(array.get_annotation(annot))

    if array.bonds is not None:
        struct_conn = _set_inter_residue_bonds(array, atom_site)
        if struct_conn is not None:
            block["struct_conn"] = struct_conn
        chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
        if chem_comp_bond is not None:
            block["chem_comp_bond"] = chem_comp_bond

    # In case of a single model handle each coordinate
    # simply like a flattened array
    if isinstance(array, AtomArray) or (
        isinstance(array, AtomArrayStack) and array.stack_depth() == 1
    ):
        # 'ravel' flattens coord without copy
        # in case of stack with stack_depth = 1
        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
        atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32)
    # In case of multiple models repeat annotations
    # and use model specific coordinates
    else:
        atom_site = _repeat(atom_site, array.stack_depth())
        coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3))
        atom_site["Cartn_x"] = np.copy(coord[:, 0])
        atom_site["Cartn_y"] = np.copy(coord[:, 1])
        atom_site["Cartn_z"] = np.copy(coord[:, 2])
        atom_site["pdbx_PDB_model_num"] = np.repeat(
            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
            repeats=array.array_length(),
        )
    if "atom_id" not in annot_categories:
        # Count from 1
        atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1)
    block["atom_site"] = atom_site

    # Write box into file
    if array.box is not None:
        # PDBx files can only store one box for all models
        # -> Use first box
        if array.box.ndim == 3:
            box = array.box[0]
        else:
            box = array.box
        len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
        cell = Category()
        cell["length_a"] = len_a
        cell["length_b"] = len_b
        cell["length_c"] = len_c
        # Angles are stored in degrees in the file
        cell["angle_alpha"] = np.rad2deg(alpha)
        cell["angle_beta"] = np.rad2deg(beta)
        cell["angle_gamma"] = np.rad2deg(gamma)
        block["cell"] = cell
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def _check_non_empty(array):
    """
    Ensure that the given structure contains at least one atom
    (and, for a stack, at least one model).

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure to check.

    Raises
    ------
    BadStructureError
        If the structure has no atoms or, in case of an
        :class:`AtomArrayStack`, no models.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.
    """
    if isinstance(array, AtomArray):
        is_empty = array.array_length() == 0
    elif isinstance(array, AtomArrayStack):
        # A stack is also empty, if it contains no models
        is_empty = array.array_length() == 0 or array.stack_depth() == 0
    else:
        raise ValueError(
            "Structure must be AtomArray or AtomArrayStack, "
            f"but got {type(array).__name__}"
        )

    if is_empty:
        raise BadStructureError("Structure must not be empty")
|
|
1058
|
+
|
|
1059
|
+
|
|
1060
|
+
def _get_or_create_block(pdbx_component, block_name):
    """
    Get the data block to write into, creating it if necessary.

    Parameters
    ----------
    pdbx_component : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file or block object.
    block_name : str or None
        The name of the requested block.
        If ``None``, the first block of the file is used, or a block
        named ``'structure'`` is created if the file is empty.
        Ignored, if `pdbx_component` is not a file object.

    Returns
    -------
    block
        The existing or newly created block.
        If `pdbx_component` is not a file object, it is returned as is.
    """
    block_class = pdbx_component.subcomponent_class()

    if not isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
        # Already a block
        return pdbx_component

    if block_name is None:
        if len(pdbx_component) > 0:
            # Default to the first block of the file
            block_name = next(iter(pdbx_component.keys()))
        else:
            # File is empty -> invent a new block name
            block_name = "structure"

    if block_name not in pdbx_component:
        pdbx_component[block_name] = block_class()
    return pdbx_component[block_name]
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
def _determine_entity_id(chain_id):
    """
    Assign an integer entity ID to each atom based on its chain ID.

    Each distinct chain ID gets its own entity ID.
    The IDs are counted from 1 in order of first appearance.

    Parameters
    ----------
    chain_id : sequence of str
        The chain ID of each atom.

    Returns
    -------
    entity_id : ndarray, dtype=int
        The entity ID of each atom.
    """
    entity_id = np.zeros(len(chain_id), dtype=int)
    # Dictionary that translates chain_id to entity_id
    known_entities = {}
    for atom_index, chain in enumerate(chain_id):
        # An unknown chain gets the next free ID (counting from 1),
        # a known chain reuses the ID assigned at first appearance
        entity_id[atom_index] = known_entities.setdefault(
            chain, len(known_entities) + 1
        )
    return entity_id
|
|
1094
|
+
|
|
1095
|
+
|
|
1096
|
+
def _repeat(category, repetitions):
    """
    Create a new category where the rows of the given category are
    repeated (tiled) the given number of times.

    Parameters
    ----------
    category : CIFCategory or BinaryCIFCategory
        The category to repeat.
    repetitions : int
        The number of times the rows are repeated.

    Returns
    -------
    repeated : CIFCategory or BinaryCIFCategory
        The new category, having the same type as the input.
    """
    category_class = type(category)
    column_class = category_class.subcomponent_class()
    data_class = column_class.subcomponent_class()

    repeated_columns = {}
    for name, column in category.items():
        tiled_values = np.tile(column.data.array, repetitions)
        if isinstance(column, BinaryCIFColumn):
            encoding = column.data.encoding
            # Optimization: The repeated string array has the same
            # unique values, as the original string array
            # -> Use same unique values (faster due to shorter array)
            # Note that this sets 'strings' on the encoding object of
            # the input column
            if isinstance(encoding[0], StringArrayEncoding):
                encoding[0].strings = np.unique(column.data.array)
            data = data_class(tiled_values, encoding)
        else:
            data = data_class(tiled_values)

        if column.mask is None:
            mask = None
        else:
            mask = data_class(np.tile(column.mask.array, repetitions))

        repeated_columns[name] = column_class(data, mask)
    return category_class(repeated_columns)
|
|
1120
|
+
|
|
1121
|
+
|
|
1122
|
+
def _set_intra_residue_bonds(array, atom_site):
    """
    Create the ``chem_comp_bond`` category containing the intra-residue
    bonds.

    ``atom_site`` is only used to infer the right :class:`Category` type
    (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).

    Returns ``None``, if the structure has no intra-residue bonds.
    Raises :class:`BadStructureError`, if any atom has an empty residue
    or atom name.
    """
    if (array.res_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty residue name, "
            "but it is required to write intra-residue bonds"
        )
    if (array.atom_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty atom name, "
            "but it is required to write intra-residue bonds"
        )

    category_class = type(atom_site)
    column_class = category_class.subcomponent_class()

    intra_bonds = _filter_bonds(array, "intra")
    n_intra_bonds = len(intra_bonds)
    if n_intra_bonds == 0:
        return None

    bond_types = intra_bonds[:, 2]
    is_any = bond_types == BondType.ANY
    value_order = np.zeros(n_intra_bonds, dtype="U4")
    aromatic_flag = np.zeros(n_intra_bonds, dtype="U1")
    # ANY bonds will be masked anyway, no need to set the value
    for i in np.where(~is_any)[0]:
        value_order[i], aromatic_flag[i] = COMP_BOND_TYPE_TO_ORDER[bond_types[i]]

    # Remove already existing residue and atom name combinations
    # These appear when the structure contains a residue multiple times
    first_atom_names = array.atom_name[intra_bonds[:, 0]]
    second_atom_names = array.atom_name[intra_bonds[:, 1]]
    # Take the residue name from the first atom index, as the residue
    # name is the same for both atoms, since we have only intra bonds
    comp_names = array.res_name[intra_bonds[:, 0]]
    bond_identifiers = np.stack(
        [comp_names, first_atom_names, second_atom_names], axis=-1
    )
    _, kept = np.unique(bond_identifiers, axis=0, return_index=True)
    # Keep the bonds in their original order
    kept = np.sort(kept)
    n_bonds = len(kept)
    is_any_kept = is_any[kept]

    chem_comp_bond = category_class()
    chem_comp_bond["pdbx_ordinal"] = np.arange(1, n_bonds + 1, dtype=np.int32)
    chem_comp_bond["comp_id"] = comp_names[kept]
    chem_comp_bond["atom_id_1"] = first_atom_names[kept]
    chem_comp_bond["atom_id_2"] = second_atom_names[kept]
    chem_comp_bond["value_order"] = column_class(
        value_order[kept],
        np.where(is_any_kept, MaskValue.MISSING, MaskValue.PRESENT),
    )
    chem_comp_bond["pdbx_aromatic_flag"] = column_class(
        aromatic_flag[kept],
        np.where(is_any_kept, MaskValue.MISSING, MaskValue.PRESENT),
    )
    # BondList does not contain stereo information
    # -> all values are missing
    chem_comp_bond["pdbx_stereo_config"] = column_class(
        np.zeros(n_bonds, dtype="U1"),
        np.full(n_bonds, MaskValue.MISSING),
    )
    return chem_comp_bond
|
|
1190
|
+
|
|
1191
|
+
|
|
1192
|
+
def _set_inter_residue_bonds(array, atom_site):
    """
    Create the ``struct_conn`` category containing the inter-residue
    bonds.

    The involved atoms are identified by annotations from the
    ``atom_site`` category.

    Returns ``None``, if no inter-residue bonds remain after the
    'standard' links between adjacent canonical residues are removed.
    """
    # The 'atom_site' columns that identify each bond partner
    identifying_columns = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "pdbx_PDB_ins_code",
    ]

    category_class = type(atom_site)
    column_class = category_class.subcomponent_class()

    inter_bonds = _filter_bonds(array, "inter")
    if len(inter_bonds) == 0:
        return None

    # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
    # nucleotide/amino acid residues
    is_canonical_link = _filter_canonical_links(array, inter_bonds)
    inter_bonds = inter_bonds[~is_canonical_link]
    if len(inter_bonds) == 0:
        return None

    bond_types = inter_bonds[:, 2]
    has_no_order = np.isin(bond_types, (BondType.ANY, BondType.COORDINATION))

    struct_conn = category_class()
    struct_conn["id"] = np.arange(1, len(inter_bonds) + 1)
    struct_conn["conn_type_id"] = [
        PDBX_BOND_TYPE_TO_TYPE_ID[bond_type] for bond_type in bond_types
    ]
    struct_conn["pdbx_value_order"] = column_class(
        np.array([PDBX_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_types]),
        np.where(has_no_order, MaskValue.MISSING, MaskValue.PRESENT),
    )
    # Write the identifying annotation...
    for column_name in identifying_columns:
        annotation = atom_site[column_name].as_array()
        # ...for each bond partner
        for partner, atom_indices in enumerate(inter_bonds[:, :2].T, start=1):
            conn_column_name = _get_struct_conn_col_name(column_name, partner)
            struct_conn[conn_column_name] = annotation[atom_indices]
    return struct_conn
|
|
1243
|
+
|
|
1244
|
+
|
|
1245
|
+
def _filter_bonds(array, connection):
    """
    Get a bond array, that contains either only intra-residue or
    only inter-residue bonds.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure whose bonds are filtered.
    connection : {'intra', 'inter'}
        Which kind of bonds to keep.

    Returns
    -------
    bond_array : ndarray
        The rows of ``array.bonds.as_array()`` that fulfill the
        requested condition.
    """
    bond_array = array.bonds.as_array()
    # To save computation time call 'get_residue_starts_for()' only once
    # with indices of the first and second atom of each bond
    residue_starts = get_residue_starts_for(
        array, bond_array[:, :2].flatten()
    ).reshape(-1, 2)
    # Both atoms are in the same residue, if their residue starts at
    # the same atom index
    in_same_residue = residue_starts[:, 0] == residue_starts[:, 1]

    if connection == "intra":
        return bond_array[in_same_residue]
    if connection == "inter":
        return bond_array[~in_same_residue]
    raise ValueError("Invalid 'connection' option")
|
|
1262
|
+
|
|
1263
|
+
|
|
1264
|
+
def _filter_canonical_links(array, bond_array):
    """
    Identify the 'standard' polymer links, i.e. backbone bonds between
    adjacent canonical amino acid or nucleotide residues.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure the bonds refer to.
    bond_array : ndarray, shape=(n,3)
        The bonds to check.
        The first two columns are the indices of the bonded atoms.

    Returns
    -------
    mask : ndarray, shape=(n,), dtype=bool
        True for each bond, where both residues are in
        ``CANONICAL_RESIDUE_LIST``, the first atom is named ``C`` or
        ``O3'``, the second atom is named ``N`` or ``P`` and the second
        residue directly follows the first one.
    """
    first_atoms = bond_array[:, 0]
    second_atoms = bond_array[:, 1]
    # Get the residue index for each bonded atom
    residue_indices = get_residue_positions(
        array, bond_array[:, :2].flatten()
    ).reshape(-1, 2)

    # Must be canonical residues
    is_canonical = (
        np.isin(array.res_name[first_atoms], CANONICAL_RESIDUE_LIST)
        & np.isin(array.res_name[second_atoms], CANONICAL_RESIDUE_LIST)
    )
    # Must be backbone bond
    is_backbone_bond = (
        np.isin(array.atom_name[first_atoms], ("C", "O3'"))
        & np.isin(array.atom_name[second_atoms], ("N", "P"))
    )
    # Must connect adjacent residues
    # The comparison needs its own parentheses: '&' binds tighter than
    # '==', so without them the residue distance would be bitwise-ANDed
    # into the mask first and every odd distance would pass the check
    is_adjacent = (residue_indices[:, 1] - residue_indices[:, 0]) == 1

    return is_canonical & is_backbone_bond & is_adjacent
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
def get_component(
    pdbx_file,
    data_block=None,
    use_ideal_coord=True,
    res_name=None,
    allow_missing_coord=False,
):
    """
    Create an :class:`AtomArray` for a chemical component from the
    ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
    category in a file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    use_ideal_coord : bool, optional
        If true, the *ideal* coordinates are read from the file
        (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
        originating from computations.
        If set to false, alternative coordinates are read
        (``model_Cartn_<dim>`` fields).
        If the selected coordinates are absent or incomplete, a warning
        is raised and the respective other fields are used as fallback.
    res_name : str, optional
        In rare cases the categories may contain rows for multiple
        components.
        In this case, the component with the given residue name is
        read.
        By default, all rows would be read in this case.
    allow_missing_coord : bool, optional
        Whether to allow missing coordinate values in components.
        If ``True``, these will be represented as ``nan`` values.
        If ``False``, a ``ValueError`` is raised when missing coordinates
        are encountered.

    Returns
    -------
    array : AtomArray
        The parsed chemical component.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(
    ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
    ... )
    >>> comp = get_component(file)
    >>> print(comp)
    HET         0  TYR N      N         1.320    0.952    1.428
    HET         0  TYR CA     C        -0.018    0.429    1.734
    HET         0  TYR C      C        -0.103    0.094    3.201
    HET         0  TYR O      O         0.886   -0.254    3.799
    HET         0  TYR CB     C        -0.274   -0.831    0.907
    HET         0  TYR CG     C        -0.189   -0.496   -0.559
    HET         0  TYR CD1    C         1.022   -0.589   -1.219
    HET         0  TYR CD2    C        -1.324   -0.102   -1.244
    HET         0  TYR CE1    C         1.103   -0.282   -2.563
    HET         0  TYR CE2    C        -1.247    0.210   -2.587
    HET         0  TYR CZ     C        -0.032    0.118   -3.252
    HET         0  TYR OH     O         0.044    0.420   -4.574
    HET         0  TYR OXT    O        -1.279    0.184    3.842
    HET         0  TYR H      H         1.977    0.225    1.669
    HET         0  TYR H2     H         1.365    1.063    0.426
    HET         0  TYR HA     H        -0.767    1.183    1.489
    HET         0  TYR HB2    H         0.473   -1.585    1.152
    HET         0  TYR HB3    H        -1.268   -1.219    1.134
    HET         0  TYR HD1    H         1.905   -0.902   -0.683
    HET         0  TYR HD2    H        -2.269   -0.031   -0.727
    HET         0  TYR HE1    H         2.049   -0.354   -3.078
    HET         0  TYR HE2    H        -2.132    0.523   -3.121
    HET         0  TYR HH     H        -0.123   -0.399   -5.059
    HET         0  TYR HXT    H        -1.333   -0.030    4.784
    """
    block = _get_block(pdbx_file, data_block)

    try:
        atom_category = block["chem_comp_atom"]
    except KeyError:
        raise InvalidFileError("Missing 'chem_comp_atom' category in file")
    if res_name is not None:
        # Restrict the rows to the requested component
        is_requested = atom_category["comp_id"].as_array() == res_name
        atom_category = _filter(atom_category, is_requested)
        if atom_category.row_count == 0:
            raise KeyError(
                f"No rows with residue name '{res_name}' found in "
                "'chem_comp_atom' category"
            )

    array = AtomArray(atom_category.row_count)

    array.set_annotation("hetero", np.full(len(atom_category["comp_id"]), True))
    array.set_annotation("res_name", atom_category["comp_id"].as_array(str))
    array.set_annotation("atom_name", atom_category["atom_id"].as_array(str))
    array.set_annotation("element", atom_category["type_symbol"].as_array(str))
    array.set_annotation("charge", atom_category["charge"].as_array(int, 0))

    ideal_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
    model_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
    if use_ideal_coord:
        preferred_fields, fallback_fields = ideal_fields, model_fields
    else:
        preferred_fields, fallback_fields = model_fields, ideal_fields

    try:
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in preferred_fields]
        )
    except (KeyError, ValueError) as err:
        if isinstance(err, KeyError):
            # One of the preferred coordinate fields does not exist
            missing_field = err.args[0]
            message = (
                f"Attribute '{missing_field}' not found within "
                "'chem_comp_atom' category. "
                "The fallback coordinates will be used instead"
            )
        else:
            # The preferred coordinate fields contain masked values
            message = (
                "The coordinates are missing for some atoms. "
                "The fallback coordinates will be used instead"
            )
        warnings.warn(message, UserWarning)
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in fallback_fields],
            allow_missing=allow_missing_coord,
        )

    try:
        bond_category = block["chem_comp_bond"]
        if res_name is not None:
            bond_category = _filter(
                bond_category, bond_category["comp_id"].as_array() == res_name
            )
    except KeyError:
        warnings.warn(
            "Category 'chem_comp_bond' not found. No bonds will be parsed",
            UserWarning,
        )
    else:
        bonds = BondList(array.array_length())
        bond_rows = zip(
            bond_category["atom_id_1"].as_array(str),
            bond_category["atom_id_2"].as_array(str),
            bond_category["value_order"].as_array(str),
            bond_category["pdbx_aromatic_flag"].as_array(str),
        )
        for first_name, second_name, order, aromatic_flag in bond_rows:
            # Bonds refer to atoms by name
            # -> use the index of the first atom with that name
            first_index = np.where(array.atom_name == first_name)[0][0]
            second_index = np.where(array.atom_name == second_name)[0][0]
            bonds.add_bond(
                first_index,
                second_index,
                COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag],
            )
        array.bonds = bonds

    return array
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
def _parse_component_coordinates(coord_columns, allow_missing=False):
    """
    Combine the given coordinate columns into a coordinate array.

    Parameters
    ----------
    coord_columns : list of CIFColumn or BinaryCIFColumn
        The columns containing the coordinates, one for each dimension.
    allow_missing : bool, optional
        If true, masked values are allowed and are set to ``nan``
        while a warning is raised.
        Otherwise a masked value leads to an exception.

    Returns
    -------
    coord : ndarray, shape=(n,3), dtype=np.float32
        The coordinates.

    Raises
    ------
    ValueError
        If a column contains masked values and `allow_missing` is false.
    """
    n_atoms = len(coord_columns[0])
    coord = np.zeros((n_atoms, 3), dtype=np.float32)
    for dim, column in enumerate(coord_columns):
        has_missing_values = column.mask is not None and column.mask.array.any()
        if has_missing_values and not allow_missing:
            raise ValueError("Missing coordinates for some atoms")
        if has_missing_values:
            warnings.warn(
                "Missing coordinates for some atoms. Those will be set to nan",
                UserWarning,
            )
        coord[:, dim] = column.as_array(np.float32, masked_value=np.nan)
    return coord
|
|
1461
|
+
|
|
1462
|
+
|
|
1463
|
+
def set_component(pdbx_file, array, data_block=None):
    """
    Set the ``chem_comp_atom`` and, if bonds are available,
    ``chem_comp_bond`` category with atom information from an
    :class:`AtomArray`.

    This will save the coordinates, the mandatory annotation categories
    and the optional ``charge`` category as well as an associated
    :class:`BondList`, if available.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray
        The chemical component to be written.
        Must contain only a single residue.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the file is empty, a new data block will be created.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    category_class = block.subcomponent_class()

    if get_residue_count(array) > 1:
        raise BadStructureError("The input atom array must comprise only one residue")
    res_name = array.res_name[0]
    n_atoms = array.array_length()

    if "charge" in array.get_annotation_categories():
        charge = array.charge.astype("U2")
    else:
        # The charge is unknown
        charge = np.full(n_atoms, "?", dtype="U2")

    atom_cat = category_class()
    atom_cat["comp_id"] = np.full(n_atoms, res_name)
    atom_cat["atom_id"] = np.copy(array.atom_name)
    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
    atom_cat["type_symbol"] = np.copy(array.element)
    atom_cat["charge"] = charge
    for dim_index, dim in enumerate(("x", "y", "z")):
        atom_cat[f"model_Cartn_{dim}"] = np.copy(array.coord[:, dim_index])
    # The same coordinates are written as ideal coordinates
    for dim in ("x", "y", "z"):
        atom_cat[f"pdbx_model_Cartn_{dim}_ideal"] = atom_cat[f"model_Cartn_{dim}"]
    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
    atom_cat["pdbx_ordinal"] = np.arange(1, n_atoms + 1).astype(str)
    block["chem_comp_atom"] = atom_cat

    if array.bonds is None or array.bonds.get_bond_count() == 0:
        # No bonds -> no 'chem_comp_bond' category is written
        return

    bond_array = array.bonds.as_array()
    n_bonds = len(bond_array)
    # NOTE(review): a bond type without an entry in
    # 'COMP_BOND_TYPE_TO_ORDER' presumably fails here - confirm
    # whether 'BondType.ANY' needs special handling
    order_and_aromaticity = [
        COMP_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_array[:, 2]
    ]

    bond_cat = category_class()
    bond_cat["comp_id"] = np.full(n_bonds, res_name)
    bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]]
    bond_cat["atom_id_2"] = array.atom_name[bond_array[:, 1]]
    bond_cat["value_order"] = np.array(
        [order_flag for order_flag, _ in order_and_aromaticity]
    )
    bond_cat["pdbx_aromatic_flag"] = np.array(
        [aromatic_flag for _, aromatic_flag in order_and_aromaticity]
    )
    bond_cat["pdbx_ordinal"] = np.arange(1, n_bonds + 1).astype(str)
    block["chem_comp_bond"] = bond_cat
|
|
1537
|
+
|
|
1538
|
+
|
|
1539
|
+
def list_assemblies(pdbx_file, data_block=None):
    """
    List the biological assemblies that are available for the structure
    in the given file.

    This function receives the data from the ``pdbx_struct_assembly``
    category in the file.
    Consequently, this category must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    assemblies : dict of str -> str
        A dictionary that maps an assembly ID to a description of the
        corresponding assembly.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly_ids = list_assemblies(file)
    >>> for key, val in assembly_ids.items():
    ...     print(f"'{key}' : '{val}'")
    '1' : 'complete icosahedral assembly'
    '2' : 'icosahedral asymmetric unit'
    '3' : 'icosahedral pentamer'
    '4' : 'icosahedral 23 hexamer'
    '5' : 'icosahedral asymmetric unit, std point frame'
    '6' : 'crystal asymmetric unit, crystal frame'
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_category = block["pdbx_struct_assembly"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly' category")

    assembly_ids = assembly_category["id"].as_array(str)
    descriptions = assembly_category["details"].as_array(str)
    return dict(zip(assembly_ids, descriptions))
|
|
1593
|
+
|
|
1594
|
+
|
|
1595
|
+
def get_assembly(
    pdbx_file,
    assembly_id=None,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build the given biological assembly.

    This function receives the data from the
    ``pdbx_struct_assembly_gen``, ``pdbx_struct_oper_list`` and
    ``atom_site`` categories in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    assembly_id : str, optional
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        By default, the first assembly listed in
        ``pdbx_struct_assembly_gen`` is built.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Inter-residue bonds, will be read from the ``struct_conn``
        category.
        Intra-residue bonds will be read from the ``chem_comp_bond``, if
        available, otherwise they will be derived from the Chemical
        Component Dictionary.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
        unit in the assembly.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly = get_assembly(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_gen_category = block["pdbx_struct_assembly_gen"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category")

    try:
        struct_oper_category = block["pdbx_struct_oper_list"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

    available_ids = assembly_gen_category["assembly_id"].as_array(str)
    if assembly_id is None:
        # Default to the first assembly in the file
        assembly_id = available_ids[0]
    elif assembly_id not in available_ids:
        raise KeyError(f"File has no Assembly ID '{assembly_id}'")

    ### Calculate all possible transformations
    transformations = _get_transformations(struct_oper_category)

    ### Get structure according to additional parameters
    if extra_fields is None:
        extra_fields = []
    # The operations apply on asym IDs
    # -> 'label_asym_id' needs to be included as annotation array
    # to select the correct atoms
    asym_was_requested = "label_asym_id" in extra_fields
    if asym_was_requested:
        extra_fields_and_asym = extra_fields
    else:
        extra_fields_and_asym = extra_fields + ["label_asym_id"]
    structure = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        extra_fields_and_asym,
        use_author_fields,
        include_bonds,
    )

    ### Get transformations and apply them to the affected asym IDs
    chain_ops = defaultdict(list)
    assembly_gen_rows = zip(
        assembly_gen_category["assembly_id"].as_array(str),
        assembly_gen_category["oper_expression"].as_array(str),
        assembly_gen_category["asym_id_list"].as_array(str),
    )
    for row_assembly_id, op_expr, asym_id_expr in assembly_gen_rows:
        # Find the operation expressions for given assembly ID
        # We already asserted that the ID is actually present
        if row_assembly_id != assembly_id:
            continue
        for chain_id in asym_id_expr.split(","):
            chain_ops[chain_id].extend(_parse_operation_expression(op_expr))

    # Create the transformed copies separately for each chain
    # and merge them afterwards
    sub_assemblies = [
        _apply_transformations(
            structure[..., structure.label_asym_id == asym_id],
            transformations,
            op_list,
        )
        for asym_id, op_list in chain_ops.items()
    ]
    assembly = concatenate(sub_assemblies)

    # Sort AtomArray or AtomArrayStack by 'sym_id'
    n_sym_ids = assembly.sym_id.max() + 1
    assembly = concatenate(
        [assembly[..., assembly.sym_id == sym_id] for sym_id in range(n_sym_ids)]
    )

    # Remove 'label_asym_id', if it was not included in the original
    # user-supplied 'extra_fields'
    if not asym_was_requested:
        assembly.del_annotation("label_asym_id")

    return assembly
|
|
1766
|
+
|
|
1767
|
+
|
|
1768
|
+
def _apply_transformations(structure, transformation_dict, operations):
    """
    Create a sub-assembly containing one transformed copy of the input
    structure for each given operation.

    Parameters
    ----------
    structure : AtomArray or AtomArrayStack
        The atoms of the affected asym IDs.
    transformation_dict : dict
        Maps an operation ID to an object, whose ``apply()`` method
        transforms coordinates.
    operations : sequence of sequence
        Each element contains the operation IDs that are successively
        applied to the coordinates to obtain one copy.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The transformed copies of `structure`.
        The added ``sym_id`` annotation holds the index of the operation
        each copy originates from.
    """
    n_copies = len(operations)
    # The additional leading dimension enumerates the copies
    # passed to 'repeat()'
    copy_coord = np.zeros((n_copies, *structure.coord.shape))
    for copy_index, op_steps in enumerate(operations):
        transformed = structure.coord
        # Run through every transformation step of the operation expression
        for step_id in op_steps:
            transformed = transformation_dict[step_id].apply(transformed)
        copy_coord[copy_index] = transformed

    assembly = repeat(structure, copy_coord)
    # Atoms of the same copy share the same 'sym_id'
    sym_ids = np.repeat(np.arange(n_copies), structure.array_length())
    assembly.set_annotation("sym_id", sym_ids)
    return assembly
|
|
1789
|
+
|
|
1790
|
+
|
|
1791
|
+
def _get_transformations(struct_oper):
    """
    Get affine transformation for each operation ID in ``pdbx_struct_oper_list``.

    Parameters
    ----------
    struct_oper : category
        The ``pdbx_struct_oper_list`` category.
        The columns ``id``, ``matrix[i][j]`` and ``vector[i]``
        (``i``, ``j`` in 1..3) are read.

    Returns
    -------
    transformation_dict : dict
        Maps each operation ID (as string) to the
        :class:`AffineTransformation` built from the rotation matrix and
        translation vector in the corresponding row.
    """
    op_ids = struct_oper["id"].as_array(str)
    # Convert every column only once instead of once per operation:
    # 'as_array()' processes the entire column on each call
    matrix_columns = [
        [struct_oper[f"matrix[{i}][{j}]"].as_array(float) for j in (1, 2, 3)]
        for i in (1, 2, 3)
    ]
    vector_columns = [
        struct_oper[f"vector[{i}]"].as_array(float) for i in (1, 2, 3)
    ]

    transformation_dict = {}
    for index, op_id in enumerate(op_ids):
        rotation_matrix = np.array(
            [[column[index] for column in row] for row in matrix_columns]
        )
        translation_vector = np.array([column[index] for column in vector_columns])
        # NOTE(review): the first argument is a zero vector, presumably the
        # translation applied before the rotation - confirm against
        # 'AffineTransformation'
        transformation_dict[op_id] = AffineTransformation(
            np.zeros(3), rotation_matrix, translation_vector
        )
    return transformation_dict
|
|
1813
|
+
|
|
1814
|
+
|
|
1815
|
+
def _parse_operation_expression(expression):
|
|
1816
|
+
"""
|
|
1817
|
+
Get successive operation steps (IDs) for the given
|
|
1818
|
+
``oper_expression``.
|
|
1819
|
+
Form the cartesian product, if necessary.
|
|
1820
|
+
"""
|
|
1821
|
+
# Split groups by parentheses:
|
|
1822
|
+
# use the opening parenthesis as delimiter
|
|
1823
|
+
# and just remove the closing parenthesis
|
|
1824
|
+
# example: '(X0)(1-10,21-25)' from 1a34
|
|
1825
|
+
expressions_per_step = expression.replace(")", "").split("(")
|
|
1826
|
+
expressions_per_step = [e for e in expressions_per_step if len(e) > 0]
|
|
1827
|
+
# Important: Operations are applied from right to left
|
|
1828
|
+
expressions_per_step.reverse()
|
|
1829
|
+
|
|
1830
|
+
operations = []
|
|
1831
|
+
for one_step_expr in expressions_per_step:
|
|
1832
|
+
one_step_op_ids = []
|
|
1833
|
+
for expr in one_step_expr.split(","):
|
|
1834
|
+
if "-" in expr:
|
|
1835
|
+
# Range of operation IDs, they must be integers
|
|
1836
|
+
first, last = expr.split("-")
|
|
1837
|
+
one_step_op_ids.extend(
|
|
1838
|
+
[str(id) for id in range(int(first), int(last) + 1)]
|
|
1839
|
+
)
|
|
1840
|
+
else:
|
|
1841
|
+
# Single operation ID
|
|
1842
|
+
one_step_op_ids.append(expr)
|
|
1843
|
+
operations.append(one_step_op_ids)
|
|
1844
|
+
|
|
1845
|
+
# Cartesian product of operations
|
|
1846
|
+
return list(itertools.product(*operations))
|
|
1847
|
+
|
|
1848
|
+
|
|
1849
|
+
def _convert_string_to_sequence(string, stype):
    """
    Convert a sequence string into a sequence object according to its
    ``_entity_poly.type``.

    The result is a :class:`ProteinSequence`, if `stype` is contained in
    ``_proteinseq_type_list``, or a :class:`NucleotideSequence`, if `stype`
    is contained in ``_nucleotideseq_type_list``.
    In the latter case each ``U`` is replaced by ``T``.
    For types in ``_other_type_list``, ``None`` is returned.
    Any other `stype` raises an :class:`InvalidFileError`.
    """
    # The sequence may be stored as a multiline string
    single_line = string.replace("\n", "")
    if stype in _proteinseq_type_list:
        return ProteinSequence(single_line)
    if stype in _nucleotideseq_type_list:
        return NucleotideSequence(single_line.replace("U", "T"))
    if stype in _other_type_list:
        return None
    raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
|
|
1866
|
+
|
|
1867
|
+
|
|
1868
|
+
def get_unit_cell(
    pdbx_file,
    center=True,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build a structure model containing all symmetric copies of the structure within a
    single unit cell.

    This function receives the data from the ``symmetry`` and ``atom_site`` categories
    in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    center : bool, optional
        If set to true, each symmetric copy will be moved inside the unit cell
        dimensions, if its centroid is outside.
        Otherwise, the copies are created using the raw space group
        transformations, which may put them one unit cell length further away.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Inter-residue bonds will be read from the ``struct_conn``
        category.
        Intra-residue bonds will be read from the ``chem_comp_bond``, if
        available, otherwise they will be derived from the Chemical
        Component Dictionary.

    Returns
    -------
    unit_cell : AtomArray or AtomArrayStack
        The structure representing the unit cell.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
        unit in the unit cell.

    Raises
    ------
    InvalidFileError
        If the file has no ``symmetry.space_group_name_H-M`` field.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> unit_cell = get_unit_cell(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    # The space group determines the symmetry operations within the unit cell
    try:
        space_group = block["symmetry"]["space_group_name_H-M"].as_item()
    except KeyError:
        raise InvalidFileError("File has no 'symmetry.space_group_name_H-M' field")
    transforms = space_group_transforms(space_group)

    # The asymmetric unit is the template for all symmetric copies
    asym = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        extra_fields,
        use_author_fields,
        include_bonds,
    )

    # The space group transformations are applied to fractional coordinates
    asym_fraction = coord_to_fraction(asym.coord, asym.box)
    copy_coord = []
    for transform in transforms:
        copy_fraction = transform.apply(asym_fraction)
        if center:
            # If the centroid is outside the box, move the copy inside the box
            old_centroid = centroid(copy_fraction)
            wrapped_centroid = old_centroid % 1
            shift = wrapped_centroid - old_centroid
            copy_fraction += shift[..., np.newaxis, :]
        copy_coord.append(fraction_to_coord(copy_fraction, asym.box))

    unit_cell = repeat(asym, np.stack(copy_coord, axis=0))
    # Atoms of the same symmetric copy share the same 'sym_id'
    sym_ids = np.repeat(np.arange(len(transforms)), asym.array_length())
    unit_cell.set_annotation("sym_id", sym_ids)
    return unit_cell
|
|
2000
|
+
|
|
2001
|
+
|
|
2002
|
+
def get_sse(pdbx_file, data_block=None, match_model=None):
    """
    Get the secondary structure from a PDBx file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
        The following categories are required:

        - ``entity_poly``
        - ``struct_conf`` (if alpha-helices are present)
        - ``struct_sheet_range`` (if beta-strands are present)
        - ``atom_site`` (if `match_model` is set)

    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    match_model : int, optional
        If a model number is given, only secondary structure elements for residues are
        kept, that are resolved in the given model.
        This means secondary structure elements for residues that would not appear
        in a corresponding :class:`AtomArray` from :func:`get_structure()` are removed.
        By default, all residues in the sequence are kept.

    Returns
    -------
    sse_dict : dict of str -> ndarray, dtype=str
        The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
        secondary structure of the respective chain.

        - ``"a"``: alpha-helix
        - ``"b"``: beta-strand
        - ``"c"``: coil or not an amino acid

        Each secondary structure element corresponds to the ``label_seq_id`` of the
        ``atom_site`` category.
        This means that the 0-th position of the array corresponds to the residue
        in ``atom_site`` with ``label_seq_id`` ``1``.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
    >>> sse = get_sse(file, match_model=1)
    >>> print(sse)
    {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
           'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
           'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
           'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
          dtype='<U1')}

    If only secondary structure elements for resolved residues are requested, the length
    of the returned array matches the number of peptide residues in the structure.

    >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
    >>> print(len(get_sse(file, match_model=1)["A"]))
    128
    >>> atoms = get_structure(file, model=1)
    >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
    >>> print(get_residue_count(atoms))
    128
    """
    block = _get_block(pdbx_file, data_block)

    # Every position of every chain starts as "c" for coil
    sse_dict = {}
    for chain_id, sequence in get_sequence(block).items():
        sse_dict[chain_id] = np.repeat("c", len(sequence))

    # Overwrite the coil with helices first and strands afterwards
    sse_sources = (
        ("a", "struct_conf"),
        ("b", "struct_sheet_range"),
    )
    for sse_symbol, category_name in sse_sources:
        if category_name not in block:
            continue
        category = block[category_name]
        chains = category["beg_auth_asym_id"].as_array(str)
        start_positions = category["beg_label_seq_id"].as_array(int)
        end_positions = category["end_label_seq_id"].as_array(int)

        # Mark the residue range of each element with the current symbol
        for chain, start, end in zip(chains, start_positions, end_positions):
            # Translate the 1-based positions from PDBx into 0-based array indices
            sse_dict[chain][start - 1 : end] = sse_symbol

    if match_model is None:
        return sse_dict

    model_atom_site = _filter_model(block["atom_site"], match_model)
    chain_ids = model_atom_site["auth_asym_id"].as_array(str)
    res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
    # Filter out masked residues, i.e. residues not part of a chain
    is_in_chain = res_ids != -1
    chain_ids = chain_ids[is_in_chain]
    res_ids = res_ids[is_in_chain]
    # Keep only the positions of residues that appear in the model;
    # subtracting 1 transforms the 1-based residue ID into a 0-based index
    return {
        chain_id: sse[np.unique(res_ids[chain_ids == chain_id]) - 1]
        for chain_id, sse in sse_dict.items()
    }
|