biotite 1.6.0__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +426 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +202 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +66 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +224 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +259 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +191 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +127 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +491 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +763 -0
- biotite/sequence/align/banded.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cp314-win_amd64.pyd +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +462 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1596 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cp314-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cp314-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cp314-win_amd64.pyd +0 -0
- biotite/structure/charges.pyx +521 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +646 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +426 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2122 -0
- biotite/structure/io/pdbx/encoding.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +452 -0
- biotite/structure/sasa.cp314-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.6.0.dist-info/METADATA +162 -0
- biotite-1.6.0.dist-info/RECORD +354 -0
- biotite-1.6.0.dist-info/WHEEL +4 -0
- biotite-1.6.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,1380 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.structure.io.pdb"
|
|
6
|
+
__author__ = "Patrick Kunzmann, Daniel Bauer, Claude J. Rogers"
|
|
7
|
+
__all__ = ["PDBFile"]
|
|
8
|
+
|
|
9
|
+
import itertools
|
|
10
|
+
import warnings
|
|
11
|
+
from collections import namedtuple
|
|
12
|
+
import numpy as np
|
|
13
|
+
from biotite.file import InvalidFileError, TextFile
|
|
14
|
+
from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
|
|
15
|
+
from biotite.structure.bonds import (
|
|
16
|
+
BondList,
|
|
17
|
+
connect_via_residue_names,
|
|
18
|
+
)
|
|
19
|
+
from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
|
|
20
|
+
from biotite.structure.error import BadStructureError
|
|
21
|
+
from biotite.structure.filter import (
|
|
22
|
+
filter_first_altloc,
|
|
23
|
+
filter_highest_occupancy_altloc,
|
|
24
|
+
filter_solvent,
|
|
25
|
+
)
|
|
26
|
+
from biotite.structure.info.bonds import bonds_in_residue
|
|
27
|
+
from biotite.structure.io.pdb.hybrid36 import (
|
|
28
|
+
decode_hybrid36,
|
|
29
|
+
encode_hybrid36,
|
|
30
|
+
max_hybrid36_number,
|
|
31
|
+
)
|
|
32
|
+
from biotite.structure.io.util import number_of_integer_digits
|
|
33
|
+
from biotite.structure.repair import infer_elements
|
|
34
|
+
from biotite.structure.util import matrix_rotate
|
|
35
|
+
|
|
36
|
+
# NOTE(review): 99999 is the largest number that fits into the 5-column
# ``_atom_id`` field defined below; how this limit is enforced is not
# visible in this part of the file - confirm against the writer code.
_PDB_MAX_ATOMS = 99999
|
|
37
|
+
# NOTE(review): 9999 is the largest number that fits into the 4-column
# ``_res_id`` field defined below - confirm against the writer code.
_PDB_MAX_RESIDUES = 9999
|
|
38
|
+
|
|
39
|
+
# Slice objects that give the fixed-width fields of a PDB line a name,
# so a field can be extracted via ``line[_field]``.
# As usual for slices, the start is 0-based and the stop is exclusive.
|
|
40
|
+
# Fields of ATOM/HETATM records
|
|
41
|
+
_record = slice(0, 6)
|
|
42
|
+
_atom_id = slice(6, 11)
|
|
43
|
+
_atom_name = slice(12, 16)
|
|
44
|
+
_alt_loc = slice(16, 17)
|
|
45
|
+
_res_name = slice(17, 20)
|
|
46
|
+
_chain_id = slice(21, 22)
|
|
47
|
+
_res_id = slice(22, 26)
|
|
48
|
+
_ins_code = slice(26, 27)
|
|
49
|
+
_coord_x = slice(30, 38)  # converted with float() in get_coord()
|
|
50
|
+
_coord_y = slice(38, 46)  # converted with float() in get_coord()
|
|
51
|
+
_coord_z = slice(46, 54)  # converted with float() in get_coord()
|
|
52
|
+
_occupancy = slice(54, 60)
|
|
53
|
+
_temp_f = slice(60, 66)  # converted with float() in get_b_factor()
|
|
54
|
+
_element = slice(76, 78)
|
|
55
|
+
_charge = slice(78, 80)
|
|
56
|
+
# Fields of the CRYST1 record
|
|
57
|
+
_a = slice(6, 15)
|
|
58
|
+
_b = slice(15, 24)
|
|
59
|
+
_c = slice(24, 33)
|
|
60
|
+
_alpha = slice(33, 40)
|
|
61
|
+
_beta = slice(40, 47)
|
|
62
|
+
_gamma = slice(47, 54)
|
|
63
|
+
_space = slice(55, 66)
|
|
64
|
+
_z = slice(66, 70)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class PDBFile(TextFile):
|
|
68
|
+
r"""
|
|
69
|
+
This class represents a PDB file.
|
|
70
|
+
|
|
71
|
+
The usage of :mod:`biotite.structure.io.pdbx` is encouraged in favor
|
|
72
|
+
of this class.
|
|
73
|
+
|
|
74
|
+
This class only provides support for reading/writing the pure atom
|
|
75
|
+
information (*ATOM*, *HETATM*, *MODEL* and *ENDMDL* records). *TER*
|
|
76
|
+
records cannot be written.
|
|
77
|
+
Additionally, *REMARK* records can be read
|
|
78
|
+
|
|
79
|
+
See Also
|
|
80
|
+
--------
|
|
81
|
+
CIFFile : Interface to CIF files, a modern replacement for PDB files.
|
|
82
|
+
BinaryCIFFile : Interface to BinaryCIF files, a binary variant of CIF files.
|
|
83
|
+
|
|
84
|
+
Examples
|
|
85
|
+
--------
|
|
86
|
+
Load a `\\*.pdb` file, modify the structure and save the new
|
|
87
|
+
structure into a new file:
|
|
88
|
+
|
|
89
|
+
>>> import os.path
|
|
90
|
+
>>> file = PDBFile.read(os.path.join(path_to_structures, "1l2y.pdb"))
|
|
91
|
+
>>> array_stack = file.get_structure()
|
|
92
|
+
>>> array_stack_mod = rotate(array_stack, [1,2,3])
|
|
93
|
+
>>> file = PDBFile()
|
|
94
|
+
>>> file.set_structure(array_stack_mod)
|
|
95
|
+
>>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb"))
|
|
96
|
+
"""
|
|
97
|
+
|
|
98
|
+
@classmethod
def read(cls, file):
    """
    Read a PDB file and prepare it for record access.

    The actual reading is delegated to the ``read()`` method of the
    parent class.
    Afterwards every line is padded to the fixed PDB line width and
    the model and atom records are indexed.

    Parameters
    ----------
    file
        Passed on unchanged to the ``read()`` method of the parent
        class.

    Returns
    -------
    file_object : PDBFile
        The parsed file.
    """
    line_width = 80
    pdb_file = super().read(file)
    # Lines may be shorter than the 80 characters required by the
    # format: fill them up with whitespace, so that column slices
    # never run past the end of a line
    padded_lines = []
    for line in pdb_file.lines:
        padded_lines.append(line.ljust(line_width))
    pdb_file.lines = padded_lines
    pdb_file._index_models_and_atoms()
    return pdb_file
|
|
106
|
+
|
|
107
|
+
def get_remark(self, number):
    r"""
    Collect the content of all *REMARK* records that carry the given
    `number`.

    Parameters
    ----------
    number : int
        The *REMARK* number, i.e. the `XXX` in ``REMARK XXX``.
        The value is converted with ``int()`` before it is used.

    Returns
    -------
    remark_lines : None or list of str
        One element for each selected *REMARK* line, with the leading
        ``REMARK XXX `` part cut off.
        The first selected line, which always must be empty, is
        left out.
        If the file has no *REMARK* record with this number,
        ``None`` is returned instead.

    Raises
    ------
    ValueError
        If `number` is not in the range 0-999.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1l2y.pdb"))
    >>> remarks = file.get_remark(900)
    >>> print("\n".join(remarks))
    RELATED ENTRIES
    RELATED ID: 5292 RELATED DB: BMRB
    BMRB 5292 IS CHEMICAL SHIFTS FOR TC5B IN BUFFER AND BUFFER
    CONTAINING 30 VOL-% TFE.
    RELATED ID: 1JRJ RELATED DB: PDB
    1JRJ IS AN ANALAGOUS C-TERMINAL STRUCTURE.
    >>> nonexistent_remark = file.get_remark(999)
    >>> print(nonexistent_remark)
    None
    """
    # Index of the first character after the ``REMARK XXX `` part
    content_start = 11

    # Guard against a non-integer that was passed by accident
    number = int(number)
    if not 0 <= number <= 999:
        raise ValueError("The number must be in range 0-999")

    # The number is right-aligned in a field of three characters
    prefix = f"REMARK {number:>3d}"
    contents = []
    for line in self.lines:
        if line.startswith(prefix):
            contents.append(line[content_start:])
    if not contents:
        return None
    # The leading empty line is not part of the content
    return contents[1:]
|
|
164
|
+
|
|
165
|
+
def get_model_count(self):
    """
    Tell how many models the PDB file contains.

    Returns
    -------
    model_count : int
        The number of models, i.e. the number of indexed model
        starts.
    """
    model_starts = self._model_start_i
    return len(model_starts)
|
|
175
|
+
|
|
176
|
+
def get_coord(self, model=None):
    """
    Read the coordinates, and nothing else, from the PDB file.

    Parameters
    ----------
    model : int, optional
        If given, a 2D coordinate array is returned, that contains
        the atoms of the model with this number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If omitted, a 3D coordinate array covering all models is
        returned, even if the structure contains only one model.

    Returns
    -------
    coord : ndarray, shape=(m,n,3) or shape=(n,3), dtype=float32
        The coordinates read from the ATOM and HETATM records of the
        file.

    Notes
    -----
    :func:`get_coord()` may output more coordinates than the atom
    array (stack) from the corresponding :func:`get_structure()`
    call has.
    The reason is, that :func:`get_structure()` filters *altloc* IDs,
    while `get_coord()` does not.

    Examples
    --------
    Read an :class:`AtomArrayStack` from multiple PDB files, where
    each PDB file contains the same atoms but different positions.
    This is an efficient approach when a trajectory is spread into
    multiple PDB files, as done e.g. by the *Rosetta* modeling
    software.

    For the purpose of this example, the PDB files are created from
    an existing :class:`AtomArrayStack`.

    >>> import os.path
    >>> from tempfile import gettempdir
    >>> file_names = []
    >>> for i in range(atom_array_stack.stack_depth()):
    ...     pdb_file = PDBFile()
    ...     pdb_file.set_structure(atom_array_stack[i])
    ...     file_name = os.path.join(gettempdir(), f"model_{i+1}.pdb")
    ...     pdb_file.write(file_name)
    ...     file_names.append(file_name)
    >>> print(file_names)
    ['...model_1.pdb', '...model_2.pdb', ..., '...model_38.pdb']

    Now the PDB files are used to create an :class:`AtomArrayStack`,
    where each model represents a different model.

    Construct a new :class:`AtomArrayStack` with annotations taken
    from one of the created files used as template and coordinates
    from all of the PDB files.

    >>> template_file = PDBFile.read(file_names[0])
    >>> template = template_file.get_structure()
    >>> coord = []
    >>> for i, file_name in enumerate(file_names):
    ...     pdb_file = PDBFile.read(file_name)
    ...     coord.append(pdb_file.get_coord(model=1))
    >>> new_stack = from_template(template, np.array(coord))

    The newly created :class:`AtomArrayStack` should now be equal to
    the :class:`AtomArrayStack` the PDB files were created from.

    >>> print(np.allclose(new_stack.coord, atom_array_stack.coord))
    True
    """
    # The x, y and z columns, in the order of the last array dimension
    axis_columns = (_coord_x, _coord_y, _coord_z)

    if model is not None:
        # Single model: only the records of that model are parsed
        record_indices = self._get_atom_record_indices_for_model(model)
        xyz = np.zeros((len(record_indices), 3), dtype=np.float32)
        for atom_i, line_i in enumerate(record_indices):
            record = self.lines[line_i]
            for axis, columns in enumerate(axis_columns):
                xyz[atom_i, axis] = float(record[columns])
        return xyz

    # All models: walk through every atom record of the file
    model_count = len(self._model_start_i)
    xyz = np.zeros(
        (model_count, self._get_model_length(), 3),
        dtype=np.float32,
    )
    last_model_i = model_count - 1
    model_i = 0
    atom_i = 0
    for line_i in self._atom_line_i:
        # A record behind the start of the following model opens
        # that model: continue with its first atom
        if model_i < last_model_i and line_i > self._model_start_i[model_i + 1]:
            model_i += 1
            atom_i = 0
        record = self.lines[line_i]
        for axis, columns in enumerate(axis_columns):
            xyz[model_i, atom_i, axis] = float(record[columns])
        atom_i += 1
    return xyz
|
|
280
|
+
|
|
281
|
+
def get_b_factor(self, model=None):
    """
    Read the B-factors, and nothing else, from the PDB file.

    Parameters
    ----------
    model : int, optional
        If given, a 1D B-factor array is returned, that contains
        the atoms of the model with this number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If omitted, a 2D B-factor array covering all models is
        returned, even if the structure contains only one model.

    Returns
    -------
    b_factor : ndarray, shape=(m,n) or shape=(n,), dtype=float32
        The B-factors read from the ATOM and HETATM records of the
        file.

    Notes
    -----
    :func:`get_b_factor()` may output more B-factors than the atom
    array (stack) from the corresponding :func:`get_structure()`
    call has atoms.
    The reason is, that :func:`get_structure()` filters *altloc* IDs,
    while `get_b_factor()` does not.
    """
    if model is not None:
        # Single model: only the records of that model are parsed
        record_indices = self._get_atom_record_indices_for_model(model)
        values = np.zeros(len(record_indices), dtype=np.float32)
        for atom_i, line_i in enumerate(record_indices):
            record = self.lines[line_i]
            values[atom_i] = float(record[_temp_f])
        return values

    # All models: walk through every atom record of the file
    model_count = len(self._model_start_i)
    values = np.zeros((model_count, self._get_model_length()), dtype=np.float32)
    last_model_i = model_count - 1
    model_i = 0
    atom_i = 0
    for line_i in self._atom_line_i:
        # A record behind the start of the following model opens
        # that model: continue with its first atom
        if model_i < last_model_i and line_i > self._model_start_i[model_i + 1]:
            model_i += 1
            atom_i = 0
        record = self.lines[line_i]
        values[model_i, atom_i] = float(record[_temp_f])
        atom_i += 1
    return values
|
|
336
|
+
|
|
337
|
+
def get_structure(
    self, model=None, altloc="first", extra_fields=[], include_bonds=False
):
    """
    Get an :class:`AtomArray` or :class:`AtomArrayStack` from the PDB file.

    This function parses standard base-10 PDB files as well as
    hybrid-36 PDB.

    Parameters
    ----------
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the
        structure contains only one model.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first
              *altloc* ID appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id``
              annotation array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are optional annotation categories
        that should be stored in the output array or stack.
        These are valid values:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
        ``'charge'``.
        The list is only read, never modified.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    array : AtomArray or AtomArrayStack
        The return type depends on the `model` parameter.

    Raises
    ------
    ValueError
        If `extra_fields` contains an unknown field name or `altloc`
        is not one of the supported options.
    """
    if model is None:
        depth = len(self._model_start_i)
        length = self._get_model_length()
        array = AtomArrayStack(depth, length)
        # Record indices for annotation determination
        # Annotation is determined from model 1
        annot_i = self._get_atom_record_indices_for_model(1)
        # Record indices for coordinate determination
        coord_i = self._atom_line_i
    else:
        annot_i = coord_i = self._get_atom_record_indices_for_model(model)
        array = AtomArray(len(coord_i))

    # Create mandatory and optional annotation arrays
    n_atoms = array.array_length()
    chain_id = np.zeros(n_atoms, array.chain_id.dtype)
    res_id = np.zeros(n_atoms, array.res_id.dtype)
    ins_code = np.zeros(n_atoms, array.ins_code.dtype)
    res_name = np.zeros(n_atoms, array.res_name.dtype)
    hetero = np.zeros(n_atoms, array.hetero.dtype)
    atom_name = np.zeros(n_atoms, array.atom_name.dtype)
    element = np.zeros(n_atoms, array.element.dtype)
    atom_id_raw = np.zeros(n_atoms, "U5")
    charge_raw = np.zeros(n_atoms, "U2")
    occupancy = np.zeros(n_atoms, float)
    b_factor = np.zeros(n_atoms, float)
    altloc_id = np.zeros(n_atoms, dtype="U1")

    # Fill annotation array
    # i is index in array, line_i is line index
    for i, line_i in enumerate(annot_i):
        line = self.lines[line_i]
        chain_id[i] = line[_chain_id].strip()
        res_id[i] = decode_hybrid36(line[_res_id])
        ins_code[i] = line[_ins_code].strip()
        res_name[i] = line[_res_name].strip()
        hetero[i] = line[_record] == "HETATM"
        atom_name[i] = line[_atom_name].strip()
        element[i] = line[_element].strip()
        altloc_id[i] = line[_alt_loc]
        atom_id_raw[i] = line[_atom_id]
        # turn "1-" into "-1", if necessary
        if line[_charge][0] in "+-":
            charge_raw[i] = line[_charge]
        else:
            charge_raw[i] = line[_charge][::-1]
        occupancy[i] = float(line[_occupancy].strip())
        b_factor[i] = float(line[_temp_f].strip())

    if include_bonds or (extra_fields is not None and "atom_id" in extra_fields):
        # The atom IDs are only required in these two cases
        atom_id = np.array(
            [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw], dtype=int
        )
    else:
        atom_id = None

    # Add annotation arrays to atom array (stack)
    array.chain_id = chain_id
    array.res_id = res_id
    array.ins_code = ins_code
    array.res_name = res_name
    array.hetero = hetero
    array.atom_name = atom_name
    array.element = element

    for field in extra_fields if extra_fields is not None else []:
        if field == "atom_id":
            # Copy is necessary to avoid double masking in
            # later altloc ID filtering
            array.set_annotation("atom_id", atom_id.copy())
        elif field == "charge":
            # A blank charge column means 'no charge'.
            # The raw column is two characters wide, so strip it before
            # testing for emptiness instead of comparing against a
            # literal with a fixed number of blanks
            charge = np.char.strip(charge_raw)
            array.set_annotation(
                "charge", np.where(charge == "", "0", charge).astype(int)
            )
        elif field == "occupancy":
            array.set_annotation("occupancy", occupancy)
        elif field == "b_factor":
            array.set_annotation("b_factor", b_factor)
        else:
            raise ValueError(f"Unknown extra field: {field}")

    # Replace empty strings for elements with guessed types
    # This is used e.g. for PDB files created by Gromacs
    empty_element_mask = array.element == ""
    if empty_element_mask.any():
        warnings.warn(
            f"{np.count_nonzero(empty_element_mask)} elements "
            "were guessed from atom name"
        )
        array.element[empty_element_mask] = infer_elements(
            array.atom_name[empty_element_mask]
        )

    # Fill in coordinates
    if isinstance(array, AtomArray):
        for i, line_i in enumerate(coord_i):
            line = self.lines[line_i]
            array.coord[i, 0] = float(line[_coord_x])
            array.coord[i, 1] = float(line[_coord_y])
            array.coord[i, 2] = float(line[_coord_z])

    elif isinstance(array, AtomArrayStack):
        # m is the model index, i is the atom index within the model
        m = 0
        i = 0
        for line_i in self._atom_line_i:
            if (
                m < len(self._model_start_i) - 1
                and line_i > self._model_start_i[m + 1]
            ):
                # The record lies behind the start of the next model
                m += 1
                i = 0
            line = self.lines[line_i]
            array.coord[m, i, 0] = float(line[_coord_x])
            array.coord[m, i, 1] = float(line[_coord_y])
            array.coord[m, i, 2] = float(line[_coord_z])
            i += 1

    # Fill in box vectors
    # PDB does not support changing box dimensions. CRYST1 is a one-time
    # record so we can extract it directly
    for line in self.lines:
        if line.startswith("CRYST1"):
            try:
                len_a = float(line[_a])
                len_b = float(line[_b])
                len_c = float(line[_c])
                alpha = np.deg2rad(float(line[_alpha]))
                beta = np.deg2rad(float(line[_beta]))
                gamma = np.deg2rad(float(line[_gamma]))
                box = vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
            except ValueError:
                # File contains invalid 'CRYST1' record
                warnings.warn(
                    "File contains invalid 'CRYST1' record, box is ignored"
                )
                break

            if isinstance(array, AtomArray):
                array.box = box
            else:
                # The same box is used for every model of the stack
                array.box = np.repeat(
                    box[np.newaxis, ...], array.stack_depth(), axis=0
                )
            break

    # Filter altloc IDs
    if altloc == "occupancy":
        altloc_mask = filter_highest_occupancy_altloc(array, altloc_id, occupancy)
        array = array[..., altloc_mask]
        atom_id = atom_id[altloc_mask] if atom_id is not None else None
    elif altloc == "first":
        altloc_mask = filter_first_altloc(array, altloc_id)
        array = array[..., altloc_mask]
        atom_id = atom_id[altloc_mask] if atom_id is not None else None
    elif altloc == "all":
        array.set_annotation("altloc_id", altloc_id)
    else:
        raise ValueError(f"'{altloc}' is not a valid 'altloc' option")

    # Read bonds
    if include_bonds:
        bond_list = self._get_bonds(atom_id)
        # Create bond dict containing only non-hetero residues (+ water)
        custom_bond_dict = {
            name: bonds_in_residue(name)
            for name in itertools.chain(
                np.unique(array[..., ~array.hetero].res_name), ["HOH"]
            )
        }
        bond_list = bond_list.merge(
            connect_via_residue_names(array, custom_bond_dict=custom_bond_dict)
        )
        array.bonds = bond_list

    return array
|
|
565
|
+
|
|
566
|
+
def get_space_group(self):
    """
    Extract the space group and Z value from the CRYST1 record.

    Returns
    -------
    space_group : str
        The extracted space group.
    z_val : int
        The extracted Z value.

    Raises
    ------
    InvalidFileError
        If the file has no ``CRYST1`` record or the Z value in the
        record cannot be parsed as integer.

    Notes
    -----
    Both values are returned together as a ``SpaceGroupInfo`` named
    tuple with the fields ``space_group`` and ``z_val``.
    """
    # Initialize the namedtuple
    SpaceGroupInfo = namedtuple("SpaceGroupInfo", ["space_group", "z_val"])

    # CRYST1 is a one-time record so we can extract it directly
    for line in self.lines:
        if line.startswith("CRYST1"):
            try:
                # Extract space group and Z value
                space_group = str(line[_space])
                z_val = int(line[_z])
            except ValueError as err:
                # File contains invalid 'CRYST1' record
                raise InvalidFileError(
                    "File does not contain valid space group and/or Z values"
                ) from err
            return SpaceGroupInfo(space_group=space_group, z_val=z_val)

    # Without this explicit error the function would fail with an
    # unrelated 'UnboundLocalError' for files lacking a CRYST1 record
    raise InvalidFileError("File does not contain a 'CRYST1' record")
|
|
597
|
+
|
|
598
|
+
def set_structure(self, array, hybrid36=False):
    """
    Set the :class:`AtomArray` or :class:`AtomArrayStack` for the
    file.

    This makes also use of the optional annotation arrays
    ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of
    continuous numbering.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The array or stack to be saved into this file. If a stack
        is given, each array in the stack is saved as separate
        model.
    hybrid36 : bool, optional
        Defines whether the file should be written in hybrid-36
        format.

    Notes
    -----
    If `array` has an associated :class:`BondList`, ``CONECT``
    records are also written for all non-water hetero residues
    and all inter-residue connections.

    All previously stored lines of the file are replaced.
    """
    _check_pdb_compatibility(array, hybrid36)

    natoms = array.array_length()
    annot_categories = array.get_annotation_categories()
    record = np.char.array(np.where(array.hetero, "HETATM", "ATOM"))
    # Check for optional annotation categories
    if "atom_id" in annot_categories:
        atom_id = array.atom_id
    else:
        atom_id = np.arange(1, natoms + 1)
    # The B-factor and occupancy columns are 6 characters wide.
    # The defaults use the same format spec as the explicit values,
    # so the column width does not depend on blanks in a literal
    if "b_factor" in annot_categories:
        b_factor = np.char.array([f"{b:>6.2f}" for b in array.b_factor])
    else:
        b_factor = np.char.array(np.full(natoms, f"{0.0:>6.2f}", dtype="U6"))
    if "occupancy" in annot_categories:
        occupancy = np.char.array([f"{o:>6.2f}" for o in array.occupancy])
    else:
        occupancy = np.char.array(np.full(natoms, f"{1.0:>6.2f}", dtype="U6"))
    if "charge" in annot_categories:
        # PDB writes the magnitude first and the sign second, e.g. '1-'
        charge = np.char.array(
            [
                str(np.abs(c)) + "+"
                if c > 0
                else (str(np.abs(c)) + "-" if c < 0 else "")
                for c in array.get_annotation("charge")
            ]
        )
    else:
        charge = np.char.array(np.full(natoms, " " * 2, dtype="U2"))

    if hybrid36:
        pdb_atom_id = np.char.array([encode_hybrid36(i, 5) for i in atom_id])
        pdb_res_id = np.char.array([encode_hybrid36(i, 4) for i in array.res_id])
    else:
        # Atom IDs are supported up to 99999,
        # but negative IDs are also possible
        pdb_atom_id = np.char.array(
            np.where(
                atom_id > 0, ((atom_id - 1) % _PDB_MAX_ATOMS) + 1, atom_id
            ).astype(str)
        )
        # Residue IDs are supported up to 9999,
        # but negative IDs are also possible
        pdb_res_id = np.char.array(
            np.where(
                array.res_id > 0,
                ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1,
                array.res_id,
            ).astype(str)
        )

    # Atom names of single-letter elements start in the second column
    # of the name field, unless the name occupies all four columns
    names = np.char.array(
        [
            f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm
            for atm, elem in zip(array.atom_name, array.element)
        ]
    )
    res_names = np.char.array(array.res_name)
    chain_ids = np.char.array(array.chain_id)
    ins_codes = np.char.array(array.ins_code)
    spaces = np.char.array(np.full(natoms, " ", dtype="U1"))
    elements = np.char.array(array.element)

    # Columns left of the coordinates: 27 characters in total
    first_half = (
        record.ljust(6)
        + pdb_atom_id.rjust(5)
        + spaces
        + names.ljust(4)
        + spaces
        + res_names.rjust(3)
        + spaces
        + chain_ids
        + pdb_res_id.rjust(4)
        + ins_codes.rjust(1)
    )

    # Columns right of the coordinates: 26 characters in total
    second_half = (
        occupancy + b_factor + 10 * spaces + elements.rjust(2) + charge.rjust(2)
    )

    coords = array.coord
    if coords.ndim == 2:
        coords = coords[np.newaxis, ...]

    self.lines = []
    # Prepend a single CRYST1 record if we have box information
    if array.box is not None:
        box = array.box
        if len(box.shape) == 3:
            box = box[0]
        a, b, c, alpha, beta, gamma = unitcell_from_vectors(box)
        # One blank column separates the angles from the space group
        # field (11 columns, left-aligned), followed by the Z value
        # (4 columns, right-aligned).
        # 'P 1' and Z = 1 are written as defaults
        cryst1 = (
            f"CRYST1{a:>9.3f}{b:>9.3f}{c:>9.3f}"
            f"{np.rad2deg(alpha):>7.2f}{np.rad2deg(beta):>7.2f}"
            f"{np.rad2deg(gamma):>7.2f} "
            f"{'P 1':<11}{1:>4}"
        )
        self.lines.append(cryst1.ljust(80))
    is_stack = coords.shape[0] > 1
    for model_num, coord_i in enumerate(coords, start=1):
        # for an AtomArray, this is run once
        # only add model lines if is_stack
        if is_stack:
            # The model serial number is written to columns 11-14
            self.lines.append(f"{'MODEL':<10}{model_num:4}")
        # Bundle non-coordinate data to simplify iteration.
        # Iterating a 'chararray' yields strings without trailing
        # whitespace, hence both halves are padded again explicitly.
        # Three blank columns separate the first half from the
        # coordinates
        self.lines.extend(
            [
                f"{start:27}{'':3}{x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}"
                for start, (x, y, z), end in zip(first_half, coord_i, second_half)
            ]
        )
        if is_stack:
            self.lines.append("ENDMDL")

    # Add CONECT records if bonds are present
    if array.bonds is not None:
        # Only non-water hetero records and connections between
        # residues are added to the records
        hetero_indices = np.where(array.hetero & ~filter_solvent(array))[0]
        bond_array = array.bonds.as_array()
        bond_array = bond_array[
            np.isin(bond_array[:, 0], hetero_indices)
            | np.isin(bond_array[:, 1], hetero_indices)
            | (array.res_id[bond_array[:, 0]] != array.res_id[bond_array[:, 1]])
            | (array.chain_id[bond_array[:, 0]] != array.chain_id[bond_array[:, 1]])
        ]
        self._set_bonds(BondList(array.array_length(), bond_array), pdb_atom_id)

    # The line content has changed -> refresh the model and atom indices
    self._index_models_and_atoms()
|
|
752
|
+
|
|
753
|
+
def set_space_group(self, info):
    """
    Update the CRYST1 record with the provided space group and Z value.

    Only the first ``CRYST1`` record is updated.
    If the file contains no such record, nothing is changed.

    Parameters
    ----------
    info : tuple(str, int) or SpaceGroupInfo
        Contains the space group and Z-value.
        Either an object with the attributes ``space_group`` and
        ``z_val`` or a plain two-element sequence
        ``(space_group, z_val)``.

    Raises
    ------
    AttributeError
        If `info` does not provide a space group string and a Z value
        in one of the supported forms.
    """
    for i, line in enumerate(self.lines):
        if not line.startswith("CRYST1"):
            continue
        try:
            try:
                space_group, z_val = info.space_group, info.z_val
            except AttributeError:
                # Not a 'SpaceGroupInfo'-like object
                # -> interpret it as plain '(space_group, z_val)' pair
                space_group, z_val = info
            # Format the replacement string
            space_group_str = space_group.ljust(11)
            z_val_str = str(z_val).rjust(4)
        except (ValueError, AttributeError, TypeError) as e:
            # Raise an exception with context
            raise AttributeError(
                f"Failed to update CRYST1 record. "
                f"Line: {line.strip()} | Error: {e}"
            ) from e
        # Replace the existing CRYST1 record:
        # The space group occupies 11 columns, the Z value 4 columns
        self.lines[i] = line[:55] + space_group_str + z_val_str + line[70:]
        break
|
|
778
|
+
|
|
779
|
+
def list_assemblies(self):
    """
    List the biological assemblies that are available for the
    structure in the given file.

    The assembly IDs are read from the first line of the
    ``REMARK 300`` records, so this remark must be present in the
    file.

    Returns
    -------
    assemblies : list of str
        A list that contains the available assembly IDs.

    Raises
    ------
    InvalidFileError
        If the file contains no ``REMARK 300`` records.

    Examples
    --------
    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1f2n.pdb"))
    >>> print(file.list_assemblies())
    ['1']
    """
    remark_300 = self.get_remark(300)
    if remark_300 is None:
        raise InvalidFileError(
            "File does not contain assembly information (REMARK 300)"
        )
    # The comma-separated IDs follow the leading 12 characters
    # of the first remark line
    id_field = remark_300[0][12:]
    return list(map(str.strip, id_field.split(",")))
|
|
807
|
+
|
|
808
|
+
def get_assembly(
    self,
    assembly_id=None,
    model=None,
    altloc="first",
    extra_fields=[],
    include_bonds=False,
):
    """
    Build the given biological assembly.

    This function receives the data from ``REMARK 350`` records in
    the file.
    Consequently, this remark must be present in the file.

    Parameters
    ----------
    assembly_id : str
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        If omitted, the first assembly listed in the file is built.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the
        structure contains only one model.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first
              *altloc* ID appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id``
              annotation array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are optional annotation categories
        that should be stored in the output array or stack.
        These are valid values:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
        ``'charge'``.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the
        asymmetric unit in the assembly.

    Raises
    ------
    InvalidFileError
        If the file contains no ``REMARK 350`` records, no assembly
        entry at all or no transformations for a set of chains.
    KeyError
        If the given `assembly_id` is not found in the file.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1f2n.pdb"))
    >>> assembly = file.get_assembly(model=1)
    """
    # Get base structure
    structure = self.get_structure(
        model,
        altloc,
        extra_fields,
        include_bonds,
    )

    # Get lines containing transformations for chosen assembly
    remark_lines = self.get_remark(350)
    if remark_lines is None:
        raise InvalidFileError(
            "File does not contain assembly information (REMARK 350)"
        )
    # Get lines corresponding to selected assembly ID
    assembly_start_i = None
    assembly_stop_i = None
    for i, line in enumerate(remark_lines):
        if line.startswith("BIOMOLECULE"):
            current_assembly_id = line[12:].strip()
            if assembly_start_i is not None:
                # Start was already found -> this is the next entry
                # -> this is the stop
                assembly_stop_i = i
                break
            if current_assembly_id == assembly_id or assembly_id is None:
                assembly_start_i = i
    if assembly_stop_i is None:
        # In case of the final assembly of the file,
        # the 'stop' is the end of REMARK 350 lines
        assembly_stop_i = len(remark_lines)
    if assembly_start_i is None:
        if assembly_id is None:
            raise InvalidFileError(
                "File does not contain transformation expressions for assemblies"
            )
        else:
            raise KeyError(f"The assembly ID '{assembly_id}' is not found")
    assembly_lines = remark_lines[assembly_start_i:assembly_stop_i]

    # Get transformations for a set of chains
    chain_set_start_indices = [
        i
        for i, line in enumerate(assembly_lines)
        if line.startswith("APPLY THE FOLLOWING TO CHAINS")
    ]
    # Add exclusive stop at end of records
    chain_set_start_indices.append(len(assembly_lines))

    # A chain specification may span multiple lines.
    # The chain IDs start at the same column in each of these lines,
    # so the prefix of continuation lines is right-aligned to the
    # width of the prefix in the first line
    first_chain_prefix = "APPLY THE FOLLOWING TO CHAINS:"
    chain_prefix_width = len(first_chain_prefix)
    chain_prefixes = (
        first_chain_prefix,
        "AND CHAINS:".rjust(chain_prefix_width),
    )

    assembly = None
    for i in range(len(chain_set_start_indices) - 1):
        start = chain_set_start_indices[i]
        stop = chain_set_start_indices[i + 1]
        # Read affected chain IDs from the following line(s)
        affected_chain_ids = []
        transform_start = None
        for j, line in enumerate(assembly_lines[start:stop]):
            if line.startswith(chain_prefixes):
                affected_chain_ids += [
                    chain_id.strip()
                    for chain_id in line[chain_prefix_width:].split(",")
                ]
            else:
                # Chain specification has finished
                # BIOMT lines start directly after chain specification
                transform_start = start + j
                break
        # Parse transformations from BIOMT lines
        if transform_start is None:
            raise InvalidFileError("No 'BIOMT' records found for chosen assembly")
        rotations, translations = _parse_transformations(
            [
                line
                for line in assembly_lines[transform_start:stop]
                if len(line.strip()) > 0
            ]
        )
        # Filter affected chains
        sub_structure = structure[
            ..., np.isin(structure.chain_id, affected_chain_ids)
        ]
        sub_assembly = _apply_transformations(
            sub_structure, rotations, translations
        )
        # Merge the chains with IDs for this transformation
        # with chains from other transformations
        if assembly is None:
            assembly = sub_assembly
        else:
            assembly += sub_assembly

    return assembly
|
|
974
|
+
|
|
975
|
+
def get_unit_cell(
    self, model=None, altloc="first", extra_fields=[], include_bonds=False
):
    """
    Build a structure model containing all symmetric copies
    of the structure within a single unit cell, given by the space
    group.

    This function receives the data from ``REMARK 290`` records in
    the file.
    Consequently, this remark must be present in the file, which is
    usually only true for crystal structures.

    Parameters
    ----------
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the
        structure contains only one model.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first
              *altloc* ID appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id``
              annotation array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are optional annotation categories
        that should be stored in the output array or stack.
        These are valid values:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
        ``'charge'``.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    symmetry_mates : AtomArray or AtomArrayStack
        All atoms within a single unit cell.
        The return type depends on the `model` parameter.

    Raises
    ------
    InvalidFileError
        If the file contains no ``REMARK 290`` records.

    Notes
    -----
    To expand the structure beyond a single unit cell, use
    :func:`repeat_box()` with the return value as its
    input.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1aki.pdb"))
    >>> atoms_in_unit_cell = file.get_unit_cell(model=1)
    """
    # Get base structure
    structure = self.get_structure(
        model,
        altloc,
        extra_fields,
        include_bonds,
    )
    # Get lines containing transformations for crystallographic symmetry
    remark_lines = self.get_remark(290)
    if remark_lines is None:
        raise InvalidFileError(
            "File does not contain crystallographic symmetry "
            "information (REMARK 290)"
        )
    # The transformation matrices are given in the 'SMTRY' lines.
    # Leading whitespace is ignored, so the selection does not depend
    # on the exact number of blanks in front of the keyword
    transform_lines = [
        line for line in remark_lines if line.lstrip().startswith("SMTRY")
    ]
    rotations, translations = _parse_transformations(transform_lines)
    return _apply_transformations(structure, rotations, translations)
|
|
1061
|
+
|
|
1062
|
+
def get_symmetry_mates(
    self, model=None, altloc="first", extra_fields=[], include_bonds=False
):
    """
    Build a structure model containing all symmetric copies
    of the structure within a single unit cell, given by the space
    group.

    This function receives the data from ``REMARK 290`` records in
    the file.
    Consequently, this remark must be present in the file, which is
    usually only true for crystal structures.

    DEPRECATED: Use :meth:`get_unit_cell()` instead.

    Parameters
    ----------
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the
        structure contains only one model.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first
              *altloc* ID appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id``
              annotation array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are optional annotation categories
        that should be stored in the output array or stack.
        These are valid values:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
        ``'charge'``.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    symmetry_mates : AtomArray or AtomArrayStack
        All atoms within a single unit cell.
        The return type depends on the `model` parameter.

    Notes
    -----
    To expand the structure beyond a single unit cell, use
    :func:`repeat_box()` with the return value as its
    input.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1aki.pdb"))
    >>> atoms_in_unit_cell = file.get_symmetry_mates(model=1)
    """
    warnings.warn(
        "'get_symmetry_mates()' is deprecated, use 'get_unit_cell()' instead",
        DeprecationWarning,
        # Attribute the warning to the calling code:
        # By default deprecation warnings are only displayed,
        # if they are triggered by code in '__main__'
        stacklevel=2,
    )
    return self.get_unit_cell(model, altloc, extra_fields, include_bonds)
|
|
1138
|
+
|
|
1139
|
+
def _index_models_and_atoms(self):
    """
    Scan ``self.lines`` and store the line indices of model starts
    in ``self._model_start_i`` and the line indices of atom records
    in ``self._atom_line_i``.
    """
    atom_record_names = ("ATOM", "HETATM")

    # Positions of the lines that open a new model
    model_positions = [
        position
        for position, content in enumerate(self.lines)
        if content.startswith("MODEL")
    ]
    self._model_start_i = np.array(model_positions, dtype=int)

    # No 'MODEL' record: the file is either empty or holds a single
    # model whose 'MODEL' line was omitted.
    # In the latter case the model is taken to start at the first line.
    if len(self._model_start_i) == 0 and any(
        content.startswith(atom_record_names) for content in self.lines
    ):
        self._model_start_i = np.array([0])

    # Positions of all ATOM and HETATM records
    atom_positions = [
        position
        for position, content in enumerate(self.lines)
        if content.startswith(atom_record_names)
    ]
    self._atom_line_i = np.array(atom_positions, dtype=int)
|
|
1163
|
+
|
|
1164
|
+
def _get_atom_record_indices_for_model(self, model):
    """
    Get the line indices of the atom records that belong to the
    given model.

    Parameters
    ----------
    model : int
        The model number, starting at 1.
        Negative values count from the last model,
        i.e. ``-1`` selects the last model.

    Returns
    -------
    indices : ndarray
        The entries of ``self._atom_line_i`` located within the
        selected model.

    Raises
    ------
    ValueError
        If `model` is 0 or if the selected model does not exist.
    """
    n_models = len(self._model_start_i)
    if model == 0:
        raise ValueError("The model index must not be 0")
    # Negative models mean index starting from last model
    resolved_model = n_models + model + 1 if model < 0 else model

    # A negative index that reaches before the first model resolves
    # to a value below 1.
    # Without this check such a value would wrap around when indexing
    # the model starts and yield a wrong (empty) selection or a bare
    # IndexError instead of a meaningful error.
    if resolved_model < 1 or resolved_model > n_models:
        raise ValueError(
            f"The file has {n_models} models, "
            f"the given model {model} does not exist"
        )

    line_filter = self._atom_line_i >= self._model_start_i[resolved_model - 1]
    if resolved_model < n_models:
        # Every model but the last one ends where the next one starts
        line_filter &= self._atom_line_i < self._model_start_i[resolved_model]
    return self._atom_line_i[line_filter]
|
|
1183
|
+
|
|
1184
|
+
def _get_model_length(self):
    """
    Count the atom records per model and make sure that the count
    is the same for every model.

    Returns
    -------
    length : int or None
        The number of atom records in each model.
        ``None``, if ``self._model_start_i`` is empty.

    Raises
    ------
    InvalidFileError
        If a model has a different number of atom records than the
        first model.
    """
    starts = self._model_start_i
    # A model reaches up to the start of the following model,
    # the last model reaches up to the end of the file
    stops = list(starts[1:]) + [len(self.lines)]

    reference_length = None
    for model_number, (start, stop) in enumerate(zip(starts, stops), start=1):
        within_model = (self._atom_line_i >= start) & (self._atom_line_i < stop)
        current_length = np.count_nonzero(within_model)
        if reference_length is None:
            # The first model defines the expected length
            reference_length = current_length
        elif current_length != reference_length:
            raise InvalidFileError(
                f"Model {model_number} has {current_length} atoms, "
                f"but model 1 has {reference_length} atoms, must be equal"
            )
    return reference_length
|
|
1209
|
+
|
|
1210
|
+
def _get_bonds(self, atom_ids):
    """
    Create a :class:`BondList` from the ``CONECT`` records in
    ``self.lines``.

    Parameters
    ----------
    atom_ids : sequence of int
        The atom ID for each atom of the structure the bonds are
        created for.
        The position of an ID in this sequence is used as atom index
        in the returned :class:`BondList`.

    Returns
    -------
    bonds : BondList
        The bonds between the atoms given by `atom_ids`.
        Records that refer to atom IDs that are not part of
        `atom_ids` are ignored.

    Raises
    ------
    InvalidFileError
        If an atom ID is larger than the last atom ID.
    """
    conect_lines = [line for line in self.lines if line.startswith("CONECT")]

    # Mapping from atom ids to indices in an AtomArray,
    # '-1' marks atom IDs that have no corresponding atom
    table_size = atom_ids[-1] + 1 if len(atom_ids) > 0 else 0
    atom_id_to_index = np.full(table_size, -1, dtype=int)
    try:
        for index, atom_id in enumerate(atom_ids):
            atom_id_to_index[atom_id] = index
    except IndexError as e:
        raise InvalidFileError("Atom IDs are not strictly increasing") from e

    def find_index(atom_id):
        # An ID beyond the table cannot belong to any atom in the
        # AtomArray, hence it is handled like any other missing atom
        # instead of failing with an IndexError
        try:
            return atom_id_to_index[atom_id]
        except IndexError:
            return -1

    bonds = []
    for line in conect_lines:
        center_index = find_index(decode_hybrid36(line[6:11]))
        if center_index == -1:
            # Atom ID is not in the AtomArray (probably removed altloc)
            continue
        # Up to four bond partners in columns of 5 characters each
        for column in range(11, 31, 5):
            try:
                contact_id = decode_hybrid36(line[column : column + 5])
            except ValueError:
                # String is empty -> no further IDs
                break
            contact_index = find_index(contact_id)
            if contact_index == -1:
                # Atom ID is not in the AtomArray (probably removed altloc)
                continue
            bonds.append((center_index, contact_index))

    # The length of the 'atom_ids' array
    # is equal to the length of the AtomArray
    return BondList(len(atom_ids), np.array(bonds, dtype=np.uint32))
|
|
1242
|
+
|
|
1243
|
+
def _set_bonds(self, bond_list, atom_ids):
    """
    Append ``CONECT`` records for the bonds in `bond_list` to
    ``self.lines``.

    Parameters
    ----------
    bond_list : BondList
        The bonds to be written.
    atom_ids : sequence
        The atom ID to be written for each atom index.
        Each ID is right-aligned in a column of 5 characters.
    """
    # The bond types are discarded, as PDB does not support bond orders
    partner_table, _ = bond_list.get_all_bonds()

    for center, row in enumerate(partner_table):
        # Collect the bond partners up to the padding values
        partners = []
        for partner in row:
            if partner == -1:
                break
            partners.append(partner)

        # A single record holds at most 4 bond partners,
        # further partners are put into additional records
        for first in range(0, len(partners), 4):
            record = f"CONECT{atom_ids[center]:>5}"
            record += "".join(
                f"{atom_ids[partner]:>5}" for partner in partners[first : first + 4]
            )
            self.lines.append(record)
|
|
1266
|
+
|
|
1267
|
+
|
|
1268
|
+
def _parse_transformations(lines):
    """
    Parse the rotation and translation transformations from
    *REMARK* 290 or 350.

    Parameters
    ----------
    lines : list of str
        The transformation lines.
        Three consecutive lines describe one transformation.
        The first two whitespace-separated fields of each line are
        ignored, the remaining four fields are the three elements of a
        rotation matrix row and one translation component.

    Returns
    -------
    rotations : ndarray, shape=(n,3,3), dtype=float
        The rotation matrices.
    translations : ndarray, shape=(n,3), dtype=float
        The translation vectors.

    Raises
    ------
    InvalidFileError
        If the number of lines is not a multiple of 3 or a line does
        not contain exactly four transformation values.
    """
    # Each transformation requires 3 lines for the (x,y,z) components
    n_transformations, n_leftover = divmod(len(lines), 3)
    if n_leftover != 0:
        raise InvalidFileError("Invalid number of transformation vectors")

    rotations = np.zeros((n_transformations, 3, 3), dtype=float)
    translations = np.zeros((n_transformations, 3), dtype=float)

    for line_i, line in enumerate(lines):
        # Skip the leading two fields
        # (component and transformation index)
        values = [float(field) for field in line.split()[2:]]
        if len(values) != 4:
            raise InvalidFileError("Invalid number of transformation vector elements")
        # Every block of three lines forms one transformation,
        # the position within the block is the (x,y,z) component
        transformation_i, component_i = divmod(line_i, 3)
        *matrix_row, shift = values
        rotations[transformation_i, component_i, :] = matrix_row
        translations[transformation_i, component_i] = shift

    return rotations, translations
|
|
1301
|
+
|
|
1302
|
+
|
|
1303
|
+
def _apply_transformations(structure, rotations, translations):
    """
    Get subassembly by applying the given transformations to the input
    structure containing affected chains.

    Parameters
    ----------
    structure : AtomArray or AtomArrayStack
        The structure to be copied and transformed.
    rotations : ndarray
        The rotation matrix for each copy.
    translations : ndarray
        The translation vector for each copy.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The repeated structure, one copy per transformation.
        The ``sym_id`` annotation holds the index of the
        transformation each atom originates from.
    """
    n_copies = len(rotations)
    # The coordinates get an additional first dimension,
    # as required by 'repeat()'
    copy_coord = np.zeros((n_copies, *structure.coord.shape))

    # Each copy is first rotated and then translated
    for copy_i, (rot_matrix, shift) in enumerate(zip(rotations, translations)):
        transformed = matrix_rotate(structure.coord, rot_matrix)
        # In-place addition on the rotated coordinates
        transformed += shift
        copy_coord[copy_i] = transformed

    assembly = repeat(structure, copy_coord)
    # Number the atoms by the transformation that created them
    sym_ids = np.repeat(np.arange(n_copies), structure.array_length())
    assembly.set_annotation("sym_id", sym_ids)
    return assembly
|
|
1325
|
+
|
|
1326
|
+
|
|
1327
|
+
def _check_pdb_compatibility(array, hybrid36):
    """
    Check whether the given structure can be written into the
    fixed-width columns of a PDB file.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure to be checked.
    hybrid36 : bool
        If true, the limits for atom and residue IDs are taken from
        :func:`max_hybrid36_number()`, otherwise the module-level
        PDB limits are used.

    Warns
    -----
    UserWarning
        If atom IDs or residue IDs exceed the respective limit.

    Raises
    ------
    BadStructureError
        If coordinates contain *NaN* values or if a chain ID, residue
        name, atom name, coordinate, B-factor, occupancy or charge
        requires more columns than available.
    """
    categories = array.get_annotation_categories()

    if hybrid36:
        atom_limit = max_hybrid36_number(5)
        residue_limit = max_hybrid36_number(4)
    else:
        atom_limit = _PDB_MAX_ATOMS
        residue_limit = _PDB_MAX_RESIDUES

    # Fall back to the array length if there is no 'atom_id' annotation
    if "atom_id" in categories:
        highest_atom_id = np.max(array.atom_id)
    else:
        highest_atom_id = array.array_length()

    # Exceeding IDs are not an error, only a warning is issued
    if highest_atom_id > atom_limit:
        warnings.warn(f"Atom IDs exceed {atom_limit:,}, will be wrapped")
    if (array.res_id > residue_limit).any():
        warnings.warn(f"Residue IDs exceed {residue_limit:,}, will be wrapped")

    if np.isnan(array.coord).any():
        raise BadStructureError("Coordinates contain 'NaN' values")

    # Maximum number of characters for the string annotations
    name_limits = (
        ("chain_id", 1, "Some chain IDs exceed 1 character"),
        ("res_name", 3, "Some residue names exceed 3 characters"),
        ("atom_name", 4, "Some atom names exceed 4 characters"),
    )
    for category, max_length, message in name_limits:
        if any(len(name) > max_length for name in getattr(array, category)):
            raise BadStructureError(message)

    for axis, axis_name in enumerate("xyz"):
        n_coord_digits = number_of_integer_digits(array.coord[..., axis])
        if n_coord_digits > 4:
            raise BadStructureError(
                f"4 pre-decimal columns for {axis_name}-coordinates are "
                f"available, but array would require {n_coord_digits}"
            )

    # Optional annotations with 3 pre-decimal columns each
    for category, label in (("b_factor", "B-factor"), ("occupancy", "occupancy")):
        if category not in categories:
            continue
        n_digits = number_of_integer_digits(getattr(array, category))
        if n_digits > 3:
            raise BadStructureError(
                f"3 pre-decimal columns for {label} are available, "
                f"but array would require {n_digits}"
            )

    if "charge" in categories:
        # The sign is not counted, as it is put into the adjacent column
        n_charge_digits = number_of_integer_digits(np.abs(array.charge))
        if n_charge_digits > 1:
            raise BadStructureError(
                "1 column for charge is available, "
                f"but array would require {n_charge_digits}"
            )
|