biotite 1.3.0__cp312-cp312-macosx_11_0_arm64.whl → 1.5.0__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biotite/application/dssp/app.py +63 -6
- biotite/database/afdb/download.py +12 -6
- biotite/database/rcsb/download.py +1 -0
- biotite/database/rcsb/query.py +2 -2
- biotite/interface/pymol/object.py +3 -1
- biotite/interface/rdkit/mol.py +5 -5
- biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
- biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
- biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
- biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
- biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
- biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
- biotite/sequence/codec.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
- biotite/structure/atoms.py +1 -1
- biotite/structure/bonds.cpython-312-darwin.so +0 -0
- biotite/structure/bonds.pyx +67 -6
- biotite/structure/box.py +1 -1
- biotite/structure/celllist.cpython-312-darwin.so +0 -0
- biotite/structure/chains.py +34 -0
- biotite/structure/charges.cpython-312-darwin.so +0 -0
- biotite/structure/compare.py +2 -0
- biotite/structure/filter.py +2 -1
- biotite/structure/geometry.py +164 -2
- biotite/structure/info/atoms.py +8 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/io/pdb/convert.py +1 -0
- biotite/structure/io/pdb/file.py +31 -7
- biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
- biotite/structure/io/pdbx/bcif.py +7 -4
- biotite/structure/io/pdbx/cif.py +6 -3
- biotite/structure/io/pdbx/compress.py +15 -11
- biotite/structure/io/pdbx/convert.py +42 -26
- biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +39 -8
- biotite/structure/residues.py +173 -1
- biotite/structure/rings.py +117 -1
- biotite/structure/sasa.cpython-312-darwin.so +0 -0
- biotite/structure/segments.py +39 -3
- biotite/structure/util.py +14 -22
- biotite/version.py +16 -3
- {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/METADATA +1 -1
- {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/RECORD +52 -52
- {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/WHEEL +0 -0
- {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/structure/geometry.py
CHANGED
|
@@ -19,19 +19,79 @@ __all__ = [
|
|
|
19
19
|
"dihedral",
|
|
20
20
|
"index_dihedral",
|
|
21
21
|
"dihedral_backbone",
|
|
22
|
+
"dihedral_side_chain",
|
|
22
23
|
"centroid",
|
|
23
24
|
]
|
|
24
25
|
|
|
26
|
+
import functools
|
|
25
27
|
import numpy as np
|
|
26
28
|
from biotite.structure.atoms import AtomArray, AtomArrayStack, coord
|
|
27
29
|
from biotite.structure.box import coord_to_fraction, fraction_to_coord, is_orthogonal
|
|
28
|
-
from biotite.structure.filter import filter_amino_acids
|
|
30
|
+
from biotite.structure.filter import filter_amino_acids, filter_canonical_amino_acids
|
|
31
|
+
from biotite.structure.residues import get_residue_starts
|
|
29
32
|
from biotite.structure.util import (
|
|
30
33
|
coord_for_atom_name_per_residue,
|
|
31
34
|
norm_vector,
|
|
32
35
|
vector_dot,
|
|
33
36
|
)
|
|
34
37
|
|
|
38
|
+
# The names of the atoms participating in chi angle
|
|
39
|
+
_CHI_ATOMS = {
|
|
40
|
+
"ARG": [
|
|
41
|
+
("N", "CA", "CB", "CG"),
|
|
42
|
+
("CA", "CB", "CG", "CD"),
|
|
43
|
+
("CB", "CG", "CD", "NE"),
|
|
44
|
+
("CG", "CD", "NE", "CZ"),
|
|
45
|
+
],
|
|
46
|
+
"LEU": [
|
|
47
|
+
("N", "CA", "CB", "CG"),
|
|
48
|
+
# By convention chi2 is defined using CD1 instead of CD2
|
|
49
|
+
("CA", "CB", "CG", "CD1"),
|
|
50
|
+
],
|
|
51
|
+
"VAL": [("N", "CA", "CB", "CG1")],
|
|
52
|
+
"ILE": [("N", "CA", "CB", "CG1"), ("CA", "CB", "CG1", "CD1")],
|
|
53
|
+
"MET": [
|
|
54
|
+
("N", "CA", "CB", "CG"),
|
|
55
|
+
("CA", "CB", "CG", "SD"),
|
|
56
|
+
("CB", "CG", "SD", "CE"),
|
|
57
|
+
],
|
|
58
|
+
"LYS": [
|
|
59
|
+
("N", "CA", "CB", "CG"),
|
|
60
|
+
("CA", "CB", "CG", "CD"),
|
|
61
|
+
("CB", "CG", "CD", "CE"),
|
|
62
|
+
("CG", "CD", "CE", "NZ"),
|
|
63
|
+
],
|
|
64
|
+
"PHE": [
|
|
65
|
+
("N", "CA", "CB", "CG"),
|
|
66
|
+
("CA", "CB", "CG", "CD1"),
|
|
67
|
+
],
|
|
68
|
+
"TRP": [
|
|
69
|
+
("N", "CA", "CB", "CG"),
|
|
70
|
+
("CA", "CB", "CG", "CD1"),
|
|
71
|
+
],
|
|
72
|
+
"TYR": [
|
|
73
|
+
("N", "CA", "CB", "CG"),
|
|
74
|
+
("CA", "CB", "CG", "CD1"),
|
|
75
|
+
],
|
|
76
|
+
"ASN": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
|
|
77
|
+
"GLN": [
|
|
78
|
+
("N", "CA", "CB", "CG"),
|
|
79
|
+
("CA", "CB", "CG", "CD"),
|
|
80
|
+
("CB", "CG", "CD", "OE1"),
|
|
81
|
+
],
|
|
82
|
+
"ASP": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
|
|
83
|
+
"GLU": [
|
|
84
|
+
("N", "CA", "CB", "CG"),
|
|
85
|
+
("CA", "CB", "CG", "CD"),
|
|
86
|
+
("CB", "CG", "CD", "OE1"),
|
|
87
|
+
],
|
|
88
|
+
"CYS": [("N", "CA", "CB", "SG")],
|
|
89
|
+
"HIS": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "ND1")],
|
|
90
|
+
"PRO": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD")],
|
|
91
|
+
"SER": [("N", "CA", "CB", "OG")],
|
|
92
|
+
"THR": [("N", "CA", "CB", "OG1")],
|
|
93
|
+
}
|
|
94
|
+
|
|
35
95
|
|
|
36
96
|
def displacement(atoms1, atoms2, box=None):
|
|
37
97
|
"""
|
|
@@ -492,7 +552,7 @@ def dihedral_backbone(atom_array):
|
|
|
492
552
|
|
|
493
553
|
Returns
|
|
494
554
|
-------
|
|
495
|
-
phi, psi, omega : ndarray
|
|
555
|
+
phi, psi, omega : ndarray, shape=(m,n) or shape=(n,), dtype=float
|
|
496
556
|
An array containing the 3 backbone dihedral angles for every CA atom.
|
|
497
557
|
`phi` is not defined at the N-terminus, `psi` and `omega` are not defined at the
|
|
498
558
|
C-terminus.
|
|
@@ -562,6 +622,96 @@ def dihedral_backbone(atom_array):
|
|
|
562
622
|
return phi, psi, omg
|
|
563
623
|
|
|
564
624
|
|
|
625
|
+
def dihedral_side_chain(atoms):
|
|
626
|
+
r"""
|
|
627
|
+
Measure the side chain :math:`\chi` dihedral angles of amino acid residues.
|
|
628
|
+
|
|
629
|
+
Parameters
|
|
630
|
+
----------
|
|
631
|
+
atoms : AtomArray or AtomArrayStack
|
|
632
|
+
The protein structure to measure the side chain dihedral angles for.
|
|
633
|
+
|
|
634
|
+
Returns
|
|
635
|
+
-------
|
|
636
|
+
chi : ndarray, shape=(m, n, 4) or shape=(n, 4), dtype=float
|
|
637
|
+
An array containing the up to four side chain dihedral angles for every
|
|
638
|
+
amino acid residue.
|
|
639
|
+
Trailing :math:`\chi` angles that are not defined for an amino acid are filled
|
|
640
|
+
with :math:`NaN` values.
|
|
641
|
+
The same is true for all residues that are not canonical amino acids.
|
|
642
|
+
|
|
643
|
+
Notes
|
|
644
|
+
-----
|
|
645
|
+
By convention, the :math:`\chi_2` angle of leucine is defined using ``CD1``
|
|
646
|
+
instead of ``CD2``.
|
|
647
|
+
|
|
648
|
+
Examples
|
|
649
|
+
--------
|
|
650
|
+
|
|
651
|
+
>>> res_ids, res_names = get_residues(atom_array)
|
|
652
|
+
>>> dihedrals = dihedral_side_chain(atom_array)
|
|
653
|
+
>>> for res_id, res_name, dihedrals in zip(res_ids, res_names, dihedrals):
|
|
654
|
+
... print(f"{res_name.capitalize()}{res_id:<2d}:", dihedrals)
|
|
655
|
+
Asn1 : [-1.180 -0.066 nan nan]
|
|
656
|
+
Leu2 : [0.923 1.866 nan nan]
|
|
657
|
+
Tyr3 : [-2.593 -1.487 nan nan]
|
|
658
|
+
Ile4 : [-0.781 -0.972 nan nan]
|
|
659
|
+
Gln5 : [-2.557 1.410 -1.776 nan]
|
|
660
|
+
Trp6 : [3.117 1.372 nan nan]
|
|
661
|
+
Leu7 : [-1.33 3.08 nan nan]
|
|
662
|
+
Lys8 : [ 1.320 1.734 3.076 -2.022]
|
|
663
|
+
Asp9 : [-1.623 0.909 nan nan]
|
|
664
|
+
Gly10: [nan nan nan nan]
|
|
665
|
+
Gly11: [nan nan nan nan]
|
|
666
|
+
Pro12: [-0.331 0.539 nan nan]
|
|
667
|
+
Ser13: [-1.067 nan nan nan]
|
|
668
|
+
Ser14: [-2.514 nan nan nan]
|
|
669
|
+
Gly15: [nan nan nan nan]
|
|
670
|
+
Arg16: [ 1.032 -3.063 1.541 -1.568]
|
|
671
|
+
Pro17: [ 0.522 -0.601 nan nan]
|
|
672
|
+
Pro18: [ 0.475 -0.577 nan nan]
|
|
673
|
+
Pro19: [ 0.561 -0.602 nan nan]
|
|
674
|
+
Ser20: [-1.055 nan nan nan]
|
|
675
|
+
"""
|
|
676
|
+
is_multi_model = isinstance(atoms, AtomArrayStack)
|
|
677
|
+
|
|
678
|
+
chi_atoms = _all_chi_atoms()
|
|
679
|
+
res_names = atoms.res_name[get_residue_starts(atoms)]
|
|
680
|
+
chi_atom_coord = coord_for_atom_name_per_residue(
|
|
681
|
+
atoms, chi_atoms, filter_canonical_amino_acids(atoms)
|
|
682
|
+
)
|
|
683
|
+
chi_atoms_to_coord_index = {atom_name: i for i, atom_name in enumerate(chi_atoms)}
|
|
684
|
+
|
|
685
|
+
if is_multi_model:
|
|
686
|
+
shape = (atoms.stack_depth(), len(res_names), 4)
|
|
687
|
+
else:
|
|
688
|
+
shape = (len(res_names), 4)
|
|
689
|
+
chi_angles = np.full(shape, np.nan, dtype=np.float32)
|
|
690
|
+
for res_name, chi_atom_names_for_all_angles in _CHI_ATOMS.items():
|
|
691
|
+
res_mask = res_names == res_name
|
|
692
|
+
for chi_i, chi_atom_names in enumerate(chi_atom_names_for_all_angles):
|
|
693
|
+
dihedrals = dihedral(
|
|
694
|
+
chi_atom_coord[
|
|
695
|
+
chi_atoms_to_coord_index[chi_atom_names[0]], ..., res_mask, :
|
|
696
|
+
],
|
|
697
|
+
chi_atom_coord[
|
|
698
|
+
chi_atoms_to_coord_index[chi_atom_names[1]], ..., res_mask, :
|
|
699
|
+
],
|
|
700
|
+
chi_atom_coord[
|
|
701
|
+
chi_atoms_to_coord_index[chi_atom_names[2]], ..., res_mask, :
|
|
702
|
+
],
|
|
703
|
+
chi_atom_coord[
|
|
704
|
+
chi_atoms_to_coord_index[chi_atom_names[3]], ..., res_mask, :
|
|
705
|
+
],
|
|
706
|
+
)
|
|
707
|
+
if is_multi_model:
|
|
708
|
+
# Swap dimensions due to NumPy's behavior when using advanced indexing
|
|
709
|
+
# (https://numpy.org/devdocs/user/basics.indexing.html#combining-advanced-and-basic-indexing)
|
|
710
|
+
dihedrals = dihedrals.T
|
|
711
|
+
chi_angles[..., res_mask, chi_i] = dihedrals
|
|
712
|
+
return chi_angles
|
|
713
|
+
|
|
714
|
+
|
|
565
715
|
def centroid(atoms):
|
|
566
716
|
"""
|
|
567
717
|
Measure the centroid of a structure.
|
|
@@ -653,3 +803,15 @@ def _displacement_triclinic_box(fractions, box, disp):
|
|
|
653
803
|
disp[:] = shifted_diffs[
|
|
654
804
|
np.arange(len(shifted_diffs)), np.argmin(sq_distance, axis=1)
|
|
655
805
|
]
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
@functools.cache
|
|
809
|
+
def _all_chi_atoms():
|
|
810
|
+
"""
|
|
811
|
+
Get the names of the atoms participating in any chi angle.
|
|
812
|
+
"""
|
|
813
|
+
atom_names = set()
|
|
814
|
+
for angles in _CHI_ATOMS.values():
|
|
815
|
+
for angle in angles:
|
|
816
|
+
atom_names.update(angle)
|
|
817
|
+
return sorted(atom_names)
|
biotite/structure/info/atoms.py
CHANGED
|
@@ -6,6 +6,7 @@ __name__ = "biotite.structure.info"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann"
|
|
7
7
|
__all__ = ["residue"]
|
|
8
8
|
|
|
9
|
+
import functools
|
|
9
10
|
from biotite.structure.info.ccd import get_ccd
|
|
10
11
|
|
|
11
12
|
# fmt: off
|
|
@@ -75,6 +76,13 @@ def residue(res_name, allow_missing_coord=False):
|
|
|
75
76
|
['CB' 'HB3']
|
|
76
77
|
['OXT' 'HXT']]
|
|
77
78
|
"""
|
|
79
|
+
# Use a cache internally, but always return a copy,
|
|
80
|
+
# as the returned AtomArray is mutable
|
|
81
|
+
return _residue(res_name, allow_missing_coord).copy()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@functools.lru_cache(maxsize=100)
|
|
85
|
+
def _residue(res_name, allow_missing_coord=False):
|
|
78
86
|
# Avoid circular import
|
|
79
87
|
from biotite.structure.io.pdbx import get_component
|
|
80
88
|
|
|
Binary file
|
biotite/structure/io/pdb/file.py
CHANGED
|
@@ -6,12 +6,16 @@ __name__ = "biotite.structure.io.pdb"
|
|
|
6
6
|
__author__ = "Patrick Kunzmann, Daniel Bauer, Claude J. Rogers"
|
|
7
7
|
__all__ = ["PDBFile"]
|
|
8
8
|
|
|
9
|
+
import itertools
|
|
9
10
|
import warnings
|
|
10
11
|
from collections import namedtuple
|
|
11
12
|
import numpy as np
|
|
12
13
|
from biotite.file import InvalidFileError, TextFile
|
|
13
14
|
from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
|
|
14
|
-
from biotite.structure.bonds import
|
|
15
|
+
from biotite.structure.bonds import (
|
|
16
|
+
BondList,
|
|
17
|
+
connect_via_residue_names,
|
|
18
|
+
)
|
|
15
19
|
from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
|
|
16
20
|
from biotite.structure.error import BadStructureError
|
|
17
21
|
from biotite.structure.filter import (
|
|
@@ -19,6 +23,7 @@ from biotite.structure.filter import (
|
|
|
19
23
|
filter_highest_occupancy_altloc,
|
|
20
24
|
filter_solvent,
|
|
21
25
|
)
|
|
26
|
+
from biotite.structure.info.bonds import bonds_in_residue
|
|
22
27
|
from biotite.structure.io.pdb.hybrid36 import (
|
|
23
28
|
decode_hybrid36,
|
|
24
29
|
encode_hybrid36,
|
|
@@ -544,7 +549,16 @@ class PDBFile(TextFile):
|
|
|
544
549
|
# Read bonds
|
|
545
550
|
if include_bonds:
|
|
546
551
|
bond_list = self._get_bonds(atom_id)
|
|
547
|
-
|
|
552
|
+
# Create bond dict containing only non-hetero residues (+ water)
|
|
553
|
+
custom_bond_dict = {
|
|
554
|
+
res_name: bonds_in_residue(res_name)
|
|
555
|
+
for res_name in itertools.chain(
|
|
556
|
+
np.unique(array[..., ~array.hetero].res_name), ["HOH"]
|
|
557
|
+
)
|
|
558
|
+
}
|
|
559
|
+
bond_list = bond_list.merge(
|
|
560
|
+
connect_via_residue_names(array, custom_bond_dict=custom_bond_dict)
|
|
561
|
+
)
|
|
548
562
|
array.bonds = bond_list
|
|
549
563
|
|
|
550
564
|
return array
|
|
@@ -936,7 +950,11 @@ class PDBFile(TextFile):
|
|
|
936
950
|
if transform_start is None:
|
|
937
951
|
raise InvalidFileError("No 'BIOMT' records found for chosen assembly")
|
|
938
952
|
rotations, translations = _parse_transformations(
|
|
939
|
-
|
|
953
|
+
[
|
|
954
|
+
line
|
|
955
|
+
for line in assembly_lines[transform_start:stop]
|
|
956
|
+
if len(line.strip()) > 0
|
|
957
|
+
]
|
|
940
958
|
)
|
|
941
959
|
# Filter affected chains
|
|
942
960
|
sub_structure = structure[
|
|
@@ -1193,7 +1211,7 @@ class PDBFile(TextFile):
|
|
|
1193
1211
|
conect_lines = [line for line in self.lines if line.startswith("CONECT")]
|
|
1194
1212
|
|
|
1195
1213
|
# Mapping from atom ids to indices in an AtomArray
|
|
1196
|
-
atom_id_to_index = np.
|
|
1214
|
+
atom_id_to_index = np.full(atom_ids[-1] + 1, -1, dtype=int)
|
|
1197
1215
|
try:
|
|
1198
1216
|
for i, id in enumerate(atom_ids):
|
|
1199
1217
|
atom_id_to_index[id] = i
|
|
@@ -1202,15 +1220,21 @@ class PDBFile(TextFile):
|
|
|
1202
1220
|
|
|
1203
1221
|
bonds = []
|
|
1204
1222
|
for line in conect_lines:
|
|
1205
|
-
|
|
1223
|
+
center_index = atom_id_to_index[decode_hybrid36(line[6:11])]
|
|
1224
|
+
if center_index == -1:
|
|
1225
|
+
# Atom ID is not in the AtomArray (probably removed altloc)
|
|
1226
|
+
continue
|
|
1206
1227
|
for i in range(11, 31, 5):
|
|
1207
1228
|
id_string = line[i : i + 5]
|
|
1208
1229
|
try:
|
|
1209
|
-
|
|
1230
|
+
contact_index = atom_id_to_index[decode_hybrid36(id_string)]
|
|
1231
|
+
if contact_index == -1:
|
|
1232
|
+
# Atom ID is not in the AtomArray (probably removed altloc)
|
|
1233
|
+
continue
|
|
1210
1234
|
except ValueError:
|
|
1211
1235
|
# String is empty -> no further IDs
|
|
1212
1236
|
break
|
|
1213
|
-
bonds.append((
|
|
1237
|
+
bonds.append((center_index, contact_index))
|
|
1214
1238
|
|
|
1215
1239
|
# The length of the 'atom_ids' array
|
|
1216
1240
|
# is equal to the length of the AtomArray
|
|
Binary file
|
|
@@ -292,7 +292,7 @@ class BinaryCIFColumn(_Component):
|
|
|
292
292
|
else:
|
|
293
293
|
# Array needs to be converted, but masked values are
|
|
294
294
|
# not necessarily convertible
|
|
295
|
-
# (e.g. '' cannot be converted to int)
|
|
295
|
+
# (e.g. '.' cannot be converted to int)
|
|
296
296
|
if masked_value is None:
|
|
297
297
|
array = np.zeros(len(self._data), dtype=dtype)
|
|
298
298
|
else:
|
|
@@ -511,7 +511,7 @@ class BinaryCIFBlock(_HierarchicalContainer):
|
|
|
511
511
|
|
|
512
512
|
def __delitem__(self, key):
|
|
513
513
|
try:
|
|
514
|
-
return super().
|
|
514
|
+
return super().__delitem__("_" + key)
|
|
515
515
|
except KeyError:
|
|
516
516
|
raise KeyError(key)
|
|
517
517
|
|
|
@@ -581,9 +581,12 @@ class BinaryCIFFile(File, _HierarchicalContainer):
|
|
|
581
581
|
|
|
582
582
|
@property
|
|
583
583
|
def block(self):
|
|
584
|
-
if len(self)
|
|
584
|
+
if len(self) == 0:
|
|
585
|
+
raise ValueError("There are no blocks in the file")
|
|
586
|
+
elif len(self) > 1:
|
|
585
587
|
raise ValueError("There are multiple blocks in the file")
|
|
586
|
-
|
|
588
|
+
else:
|
|
589
|
+
return self[next(iter(self))]
|
|
587
590
|
|
|
588
591
|
@staticmethod
|
|
589
592
|
def subcomponent_class():
|
biotite/structure/io/pdbx/cif.py
CHANGED
|
@@ -243,7 +243,7 @@ class CIFColumn:
|
|
|
243
243
|
else:
|
|
244
244
|
# Array needs to be converted, but masked values are
|
|
245
245
|
# not necessarily convertible
|
|
246
|
-
# (e.g. '' cannot be converted to int)
|
|
246
|
+
# (e.g. '.' cannot be converted to int)
|
|
247
247
|
if masked_value is None:
|
|
248
248
|
array = np.zeros(len(self._data), dtype=dtype)
|
|
249
249
|
else:
|
|
@@ -799,9 +799,12 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
799
799
|
|
|
800
800
|
@property
|
|
801
801
|
def block(self):
|
|
802
|
-
if len(self)
|
|
802
|
+
if len(self) == 0:
|
|
803
|
+
raise ValueError("There are no blocks in the file")
|
|
804
|
+
elif len(self) > 1:
|
|
803
805
|
raise ValueError("There are multiple blocks in the file")
|
|
804
|
-
|
|
806
|
+
else:
|
|
807
|
+
return self[next(iter(self))]
|
|
805
808
|
|
|
806
809
|
@staticmethod
|
|
807
810
|
def subcomponent_class():
|
|
@@ -56,14 +56,14 @@ def compress(data, float_tolerance=None, rtol=1e-6, atol=1e-4):
|
|
|
56
56
|
>>> pdbx_file.write(uncompressed_file)
|
|
57
57
|
>>> _ = uncompressed_file.seek(0)
|
|
58
58
|
>>> print(f"{len(uncompressed_file.read()) // 1000} KB")
|
|
59
|
-
|
|
59
|
+
937 KB
|
|
60
60
|
>>> # Write compressed file
|
|
61
61
|
>>> pdbx_file = compress(pdbx_file)
|
|
62
62
|
>>> compressed_file = BytesIO()
|
|
63
63
|
>>> pdbx_file.write(compressed_file)
|
|
64
64
|
>>> _ = compressed_file.seek(0)
|
|
65
65
|
>>> print(f"{len(compressed_file.read()) // 1000} KB")
|
|
66
|
-
|
|
66
|
+
114 KB
|
|
67
67
|
"""
|
|
68
68
|
if float_tolerance is not None:
|
|
69
69
|
warnings.warn(
|
|
@@ -140,8 +140,8 @@ def _compress_data(bcif_data, rtol, atol):
|
|
|
140
140
|
# Run encode to initialize the data and offset arrays
|
|
141
141
|
indices = encoding.encode(array)
|
|
142
142
|
offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
|
|
143
|
-
encoding.data_encoding
|
|
144
|
-
encoding.offset_encoding
|
|
143
|
+
encoding.data_encoding = _find_best_integer_compression(indices)
|
|
144
|
+
encoding.offset_encoding = _find_best_integer_compression(offsets)
|
|
145
145
|
return bcif.BinaryCIFData(array, [encoding])
|
|
146
146
|
|
|
147
147
|
elif np.issubdtype(array.dtype, np.floating):
|
|
@@ -159,18 +159,22 @@ def _compress_data(bcif_data, rtol, atol):
|
|
|
159
159
|
# -> do not use integer encoding
|
|
160
160
|
return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
|
|
161
161
|
else:
|
|
162
|
-
best_encoding
|
|
163
|
-
|
|
162
|
+
best_encoding = _find_best_integer_compression(integer_array)
|
|
163
|
+
compressed_data = bcif.BinaryCIFData(
|
|
164
|
+
array, [to_integer_encoding] + best_encoding
|
|
164
165
|
)
|
|
165
|
-
|
|
166
|
-
|
|
166
|
+
uncompressed_data = bcif.BinaryCIFData(array, [ByteArrayEncoding()])
|
|
167
|
+
if _data_size_in_file(compressed_data) < _data_size_in_file(
|
|
168
|
+
uncompressed_data
|
|
169
|
+
):
|
|
170
|
+
return compressed_data
|
|
167
171
|
else:
|
|
168
172
|
# The float array is smaller -> encode it directly as bytes
|
|
169
|
-
return
|
|
173
|
+
return uncompressed_data
|
|
170
174
|
|
|
171
175
|
elif np.issubdtype(array.dtype, np.integer):
|
|
172
176
|
array = _to_smallest_integer_type(array)
|
|
173
|
-
encodings
|
|
177
|
+
encodings = _find_best_integer_compression(array)
|
|
174
178
|
return bcif.BinaryCIFData(array, encodings)
|
|
175
179
|
|
|
176
180
|
else:
|
|
@@ -233,7 +237,7 @@ def _find_best_integer_compression(array):
|
|
|
233
237
|
if size < smallest_size:
|
|
234
238
|
best_encoding_sequence = encodings
|
|
235
239
|
smallest_size = size
|
|
236
|
-
return best_encoding_sequence
|
|
240
|
+
return best_encoding_sequence
|
|
237
241
|
|
|
238
242
|
|
|
239
243
|
def _estimate_packed_length(array, packed_byte_count):
|
|
@@ -55,6 +55,7 @@ from biotite.structure.io.pdbx.bcif import (
|
|
|
55
55
|
from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
|
|
56
56
|
from biotite.structure.io.pdbx.component import MaskValue
|
|
57
57
|
from biotite.structure.io.pdbx.encoding import StringArrayEncoding
|
|
58
|
+
from biotite.structure.repair import create_continuous_res_ids
|
|
58
59
|
from biotite.structure.residues import (
|
|
59
60
|
get_residue_count,
|
|
60
61
|
get_residue_positions,
|
|
@@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
|
|
|
496
497
|
atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
|
|
497
498
|
).as_array(str),
|
|
498
499
|
)
|
|
499
|
-
array.set_annotation(
|
|
500
|
-
"res_id",
|
|
501
|
-
_get_or_fallback(
|
|
502
|
-
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
|
|
503
|
-
).as_array(int, -1),
|
|
504
|
-
)
|
|
505
500
|
array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
|
|
506
501
|
array.set_annotation(
|
|
507
502
|
"res_name",
|
|
@@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
|
|
|
518
513
|
)
|
|
519
514
|
array.set_annotation("element", atom_site["type_symbol"].as_array(str))
|
|
520
515
|
|
|
516
|
+
# Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
|
|
517
|
+
# hetero residues, which makes distinguishing subsequent residues from one another
|
|
518
|
+
# difficult (https://github.com/biotite-dev/biotite/issues/553)
|
|
519
|
+
res_id = _get_or_fallback(
|
|
520
|
+
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
|
|
521
|
+
).as_array(int, -1)
|
|
522
|
+
if not use_author_fields and "auth_seq_id" in atom_site:
|
|
523
|
+
# Therefore, the `auth_seq_id` is still used to determine residue starts
|
|
524
|
+
# in `create_continuous_res_ids()`, even if `use_author_fields = False`.
|
|
525
|
+
res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
|
|
526
|
+
array.set_annotation("res_id", res_id_for_residue_starts)
|
|
527
|
+
fallback_res_ids = create_continuous_res_ids(array)
|
|
528
|
+
array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
|
|
529
|
+
else:
|
|
530
|
+
array.set_annotation("res_id", res_id)
|
|
531
|
+
|
|
521
532
|
if "atom_id" in extra_fields:
|
|
522
533
|
if "id" in atom_site:
|
|
523
534
|
array.set_annotation("atom_id", atom_site["id"].as_array(int))
|
|
@@ -775,7 +786,10 @@ def _filter_altloc(array, atom_site, altloc):
|
|
|
775
786
|
if altloc == "all":
|
|
776
787
|
array.set_annotation("altloc_id", altloc_ids.as_array(str))
|
|
777
788
|
return array, atom_site
|
|
778
|
-
elif altloc_ids is None or (
|
|
789
|
+
elif altloc_ids is None or (
|
|
790
|
+
altloc_ids.mask is not None
|
|
791
|
+
and (altloc_ids.mask.array != MaskValue.PRESENT).all()
|
|
792
|
+
):
|
|
779
793
|
# No altlocs in atom_site category
|
|
780
794
|
return array, atom_site
|
|
781
795
|
elif altloc == "occupancy" and occupancy is not None:
|
|
@@ -873,11 +887,7 @@ def set_structure(
|
|
|
873
887
|
this parameter is ignored.
|
|
874
888
|
If the file is empty, a new data block will be created.
|
|
875
889
|
include_bonds : bool, optional
|
|
876
|
-
|
|
877
|
-
intra-residue bonds will be written into the ``chem_comp_bond``
|
|
878
|
-
category.
|
|
879
|
-
Inter-residue bonds will be written into the ``struct_conn``
|
|
880
|
-
independent of this parameter.
|
|
890
|
+
DEPRECATED: Has no effect anymore.
|
|
881
891
|
extra_fields : list of str, optional
|
|
882
892
|
List of additional fields from the ``atom_site`` category
|
|
883
893
|
that should be written into the file.
|
|
@@ -898,6 +908,13 @@ def set_structure(
|
|
|
898
908
|
>>> set_structure(file, atom_array)
|
|
899
909
|
>>> file.write(os.path.join(path_to_directory, "structure.cif"))
|
|
900
910
|
"""
|
|
911
|
+
if include_bonds:
|
|
912
|
+
warnings.warn(
|
|
913
|
+
"`include_bonds` parameter is deprecated, "
|
|
914
|
+
"intra-residue are always written, if available",
|
|
915
|
+
DeprecationWarning,
|
|
916
|
+
)
|
|
917
|
+
|
|
901
918
|
_check_non_empty(array)
|
|
902
919
|
|
|
903
920
|
block = _get_or_create_block(pdbx_file, data_block)
|
|
@@ -975,10 +992,9 @@ def set_structure(
|
|
|
975
992
|
struct_conn = _set_inter_residue_bonds(array, atom_site)
|
|
976
993
|
if struct_conn is not None:
|
|
977
994
|
block["struct_conn"] = struct_conn
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
block["chem_comp_bond"] = chem_comp_bond
|
|
995
|
+
chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
|
|
996
|
+
if chem_comp_bond is not None:
|
|
997
|
+
block["chem_comp_bond"] = chem_comp_bond
|
|
982
998
|
|
|
983
999
|
# In case of a single model handle each coordinate
|
|
984
1000
|
# simply like a flattened array
|
|
@@ -1652,11 +1668,11 @@ def get_assembly(
|
|
|
1652
1668
|
If set to true, a :class:`BondList` will be created for the
|
|
1653
1669
|
resulting :class:`AtomArray` containing the bond information
|
|
1654
1670
|
from the file.
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1671
|
+
Inter-residue bonds will be read from the ``struct_conn``
|
|
1672
|
+
category.
|
|
1673
|
+
Intra-residue bonds will be read from the ``chem_comp_bond``, if
|
|
1674
|
+
available, otherwise they will be derived from the Chemical
|
|
1675
|
+
Component Dictionary.
|
|
1660
1676
|
|
|
1661
1677
|
Returns
|
|
1662
1678
|
-------
|
|
@@ -1926,11 +1942,11 @@ def get_unit_cell(
|
|
|
1926
1942
|
If set to true, a :class:`BondList` will be created for the
|
|
1927
1943
|
resulting :class:`AtomArray` containing the bond information
|
|
1928
1944
|
from the file.
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1945
|
+
Inter-residue bonds will be read from the ``struct_conn``
|
|
1946
|
+
category.
|
|
1947
|
+
Intra-residue bonds will be read from the ``chem_comp_bond``, if
|
|
1948
|
+
available, otherwise they will be derived from the Chemical
|
|
1949
|
+
Component Dictionary.
|
|
1934
1950
|
|
|
1935
1951
|
Returns
|
|
1936
1952
|
-------
|
|
Binary file
|
|
@@ -225,9 +225,13 @@ class Encoding(_Component, metaclass=ABCMeta):
|
|
|
225
225
|
-------
|
|
226
226
|
decoded_data : ndarray
|
|
227
227
|
The decoded data.
|
|
228
|
+
|
|
229
|
+
Warnings
|
|
230
|
+
--------
|
|
231
|
+
When overriding this method, do not omit bound checks with
|
|
232
|
+
``@cython.boundscheck(False)`` or ``@cython.wraparound(False)``,
|
|
233
|
+
since the file content may be invalid/malicious.
|
|
228
234
|
"""
|
|
229
|
-
# Important: Do not omit bound checks for decoding,
|
|
230
|
-
# since the file content may be invalid/malicious.
|
|
231
235
|
raise NotImplementedError()
|
|
232
236
|
|
|
233
237
|
def __str__(self):
|
|
@@ -883,17 +887,39 @@ class StringArrayEncoding(Encoding):
|
|
|
883
887
|
else:
|
|
884
888
|
check_present = True
|
|
885
889
|
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
890
|
+
if len(self.strings) > 0:
|
|
891
|
+
string_order = _safe_cast(np.argsort(self.strings), np.int32)
|
|
892
|
+
sorted_strings = self.strings[string_order]
|
|
893
|
+
sorted_indices = np.searchsorted(sorted_strings, data)
|
|
894
|
+
indices = string_order[sorted_indices]
|
|
895
|
+
# `"" not in self.strings` can be quite costly and is only necessary,
|
|
896
|
+
# if the `strings` were given by the user, as otherwise we always
|
|
897
|
+
# include an empty string explicitly when we compute them in this function
|
|
898
|
+
# -> Only run if `check_present` is True
|
|
899
|
+
if check_present and "" not in self.strings:
|
|
900
|
+
# Represent empty strings as -1
|
|
901
|
+
indices[data == ""] = -1
|
|
902
|
+
else:
|
|
903
|
+
# There are no strings -> The indices can only ever be -1 to indicate
|
|
904
|
+
# missing values
|
|
905
|
+
# The check if this is correct is done below
|
|
906
|
+
indices = np.full(data.shape[0], -1, dtype=np.int32)
|
|
907
|
+
|
|
908
|
+
valid_indices_mask = indices != -1
|
|
909
|
+
if check_present and not np.all(
|
|
910
|
+
self.strings[indices[valid_indices_mask]] == data[valid_indices_mask]
|
|
911
|
+
):
|
|
891
912
|
raise ValueError("Data contains strings not present in 'strings'")
|
|
892
913
|
return encode_stepwise(indices, self.data_encoding)
|
|
893
914
|
|
|
894
915
|
def decode(self, data):
|
|
895
916
|
indices = decode_stepwise(data, self.data_encoding)
|
|
896
|
-
|
|
917
|
+
# Initialize with empty strings
|
|
918
|
+
strings = np.zeros(indices.shape[0], dtype=self.strings.dtype)
|
|
919
|
+
# `-1` indices indicate missing values
|
|
920
|
+
valid_indices_mask = indices != -1
|
|
921
|
+
strings[valid_indices_mask] = self.strings[indices[valid_indices_mask]]
|
|
922
|
+
return strings
|
|
897
923
|
|
|
898
924
|
def __eq__(self, other):
|
|
899
925
|
if not isinstance(other, type(self)):
|
|
@@ -1009,6 +1035,11 @@ def decode_stepwise(data, encoding):
|
|
|
1009
1035
|
"""
|
|
1010
1036
|
for enc in reversed(encoding):
|
|
1011
1037
|
data = enc.decode(data)
|
|
1038
|
+
# ByteArrayEncoding may decode into a non-writable array,
|
|
1039
|
+
# as it creates the ndarray cheaply from buffer
|
|
1040
|
+
if not data.flags.writeable:
|
|
1041
|
+
# Make the resulting ndarray writable, by copying the underlying buffer
|
|
1042
|
+
data = data.copy()
|
|
1012
1043
|
return data
|
|
1013
1044
|
|
|
1014
1045
|
|