biotite 1.0.0__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +34 -0
- biotite/application/muscle/app3.py +2 -15
- biotite/application/muscle/app5.py +2 -2
- biotite/application/util.py +1 -1
- biotite/application/viennarna/rnaplot.py +6 -2
- biotite/database/rcsb/query.py +6 -6
- biotite/database/uniprot/check.py +20 -15
- biotite/database/uniprot/download.py +1 -1
- biotite/database/uniprot/query.py +1 -1
- biotite/sequence/align/alignment.py +16 -3
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +5 -5
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +17 -0
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +52 -42
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/matrix.py +273 -55
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/alphabet.py +3 -0
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/sequence/profile.py +86 -4
- biotite/sequence/seqtypes.py +124 -3
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +4 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +110 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +171 -0
- biotite/structure/alphabet/unkerasify.py +122 -0
- biotite/structure/atoms.py +156 -43
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/bonds.pyx +72 -21
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/filter.py +1 -1
- biotite/structure/geometry.py +60 -113
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +13 -13
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -32
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +63 -17
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -21
- biotite/structure/info/standardize.py +3 -2
- biotite/structure/io/mol/sdf.py +41 -40
- biotite/structure/io/pdb/convert.py +2 -0
- biotite/structure/io/pdb/file.py +74 -3
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +32 -8
- biotite/structure/io/pdbx/cif.py +148 -107
- biotite/structure/io/pdbx/component.py +9 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +227 -68
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +16 -16
- biotite/structure/molecules.py +141 -141
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/segments.py +1 -2
- biotite/structure/util.py +73 -1
- biotite/version.py +2 -2
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/sequence/profile.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# information.
|
|
4
4
|
|
|
5
5
|
import warnings
|
|
6
|
+
from numbers import Integral
|
|
6
7
|
import numpy as np
|
|
7
8
|
from biotite.sequence.align.alignment import get_codes
|
|
8
9
|
from biotite.sequence.alphabet import LetterAlphabet
|
|
@@ -66,6 +67,9 @@ class SequenceProfile(object):
|
|
|
66
67
|
It also saves the number of gaps at each position in the array
|
|
67
68
|
'gaps'.
|
|
68
69
|
|
|
70
|
+
With :meth:`from_alignment()` a :class:`SequenceProfile` object can
|
|
71
|
+
be created from an indefinite number of aligned sequences.
|
|
72
|
+
|
|
69
73
|
With :meth:`probability_matrix()` the position probability matrix
|
|
70
74
|
can be created based on 'symbols' and a pseudocount.
|
|
71
75
|
|
|
@@ -73,9 +77,6 @@ class SequenceProfile(object):
|
|
|
73
77
|
be created based on the previously calculated position probability
|
|
74
78
|
matrix and the background frequencies.
|
|
75
79
|
|
|
76
|
-
With :meth:`from_alignment()` a :class:`SequenceProfile` object can
|
|
77
|
-
be created from an indefinite number of aligned sequences.
|
|
78
|
-
|
|
79
80
|
With :meth:`sequence_probability_from_matrix()` the probability of a
|
|
80
81
|
sequence can be calculated based on the previously calculated position
|
|
81
82
|
probability matrix of this instance of object SequenceProfile.
|
|
@@ -105,6 +106,63 @@ class SequenceProfile(object):
|
|
|
105
106
|
Array which indicates the number of gaps at each position.
|
|
106
107
|
alphabet : Alphabet, length=k
|
|
107
108
|
Alphabet of sequences of sequence profile
|
|
109
|
+
|
|
110
|
+
Examples
|
|
111
|
+
--------
|
|
112
|
+
|
|
113
|
+
Create a profile from a multiple sequence alignment:
|
|
114
|
+
|
|
115
|
+
>>> sequences = [
|
|
116
|
+
... NucleotideSequence("CGCTCATTC"),
|
|
117
|
+
... NucleotideSequence("CGCTATTC"),
|
|
118
|
+
... NucleotideSequence("CCCTCAATC"),
|
|
119
|
+
... ]
|
|
120
|
+
>>> msa, _, _, _ = align_multiple(
|
|
121
|
+
... sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
|
|
122
|
+
... )
|
|
123
|
+
>>> print(msa)
|
|
124
|
+
CGCTCATTC
|
|
125
|
+
CGCT-ATTC
|
|
126
|
+
CCCTCAATC
|
|
127
|
+
>>> profile = SequenceProfile.from_alignment(msa)
|
|
128
|
+
>>> print(profile)
|
|
129
|
+
A C G T
|
|
130
|
+
0 0 3 0 0
|
|
131
|
+
1 0 1 2 0
|
|
132
|
+
2 0 3 0 0
|
|
133
|
+
3 0 0 0 3
|
|
134
|
+
4 0 2 0 0
|
|
135
|
+
5 3 0 0 0
|
|
136
|
+
6 1 0 0 2
|
|
137
|
+
7 0 0 0 3
|
|
138
|
+
8 0 3 0 0
|
|
139
|
+
>>> print(profile.gaps)
|
|
140
|
+
[0 0 0 0 1 0 0 0 0]
|
|
141
|
+
|
|
142
|
+
Slice the profile (masks and index arrays are also supported):
|
|
143
|
+
|
|
144
|
+
>>> print(profile[2:])
|
|
145
|
+
A C G T
|
|
146
|
+
0 0 3 0 0
|
|
147
|
+
1 0 0 0 3
|
|
148
|
+
2 0 2 0 0
|
|
149
|
+
3 3 0 0 0
|
|
150
|
+
4 1 0 0 2
|
|
151
|
+
5 0 0 0 3
|
|
152
|
+
6 0 3 0 0
|
|
153
|
+
|
|
154
|
+
Use the profile to compute the position probability matrix:
|
|
155
|
+
|
|
156
|
+
>>> print(profile.probability_matrix())
|
|
157
|
+
[[0.000 1.000 0.000 0.000]
|
|
158
|
+
[0.000 0.333 0.667 0.000]
|
|
159
|
+
[0.000 1.000 0.000 0.000]
|
|
160
|
+
[0.000 0.000 0.000 1.000]
|
|
161
|
+
[0.000 1.000 0.000 0.000]
|
|
162
|
+
[1.000 0.000 0.000 0.000]
|
|
163
|
+
[0.333 0.000 0.000 0.667]
|
|
164
|
+
[0.000 0.000 0.000 1.000]
|
|
165
|
+
[0.000 1.000 0.000 0.000]]
|
|
108
166
|
"""
|
|
109
167
|
|
|
110
168
|
def __init__(self, symbols, gaps, alphabet):
|
|
@@ -156,8 +214,23 @@ class SequenceProfile(object):
|
|
|
156
214
|
)
|
|
157
215
|
self._gaps = new_gaps
|
|
158
216
|
|
|
217
|
+
def __str__(self):
|
|
218
|
+
# Add an additional row and column for the position and symbol indicators
|
|
219
|
+
print_matrix = np.full(
|
|
220
|
+
(self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
|
|
221
|
+
)
|
|
222
|
+
print_matrix[1:, 1:] = self.symbols.astype(str)
|
|
223
|
+
print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
|
|
224
|
+
print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
|
|
225
|
+
max_len = len(max(print_matrix.flatten(), key=len))
|
|
226
|
+
return "\n".join(
|
|
227
|
+
[
|
|
228
|
+
" ".join([str(cell).rjust(max_len) for cell in row])
|
|
229
|
+
for row in print_matrix
|
|
230
|
+
]
|
|
231
|
+
)
|
|
232
|
+
|
|
159
233
|
def __repr__(self):
|
|
160
|
-
"""Represent SequenceProfile as a string for debugging."""
|
|
161
234
|
return (
|
|
162
235
|
f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
|
|
163
236
|
f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
|
|
@@ -483,3 +556,12 @@ class SequenceProfile(object):
|
|
|
483
556
|
f"as 'symbols' {self.symbols.shape}"
|
|
484
557
|
)
|
|
485
558
|
return np.sum(pwm[np.arange(len(sequence)), sequence.code])
|
|
559
|
+
|
|
560
|
+
def __getitem__(self, index):
|
|
561
|
+
if isinstance(index, Integral):
|
|
562
|
+
# Do not allow to collapse dimensions
|
|
563
|
+
index = slice(index, index + 1)
|
|
564
|
+
return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
|
|
565
|
+
|
|
566
|
+
def __len__(self):
|
|
567
|
+
return len(self.symbols)
|
biotite/sequence/seqtypes.py
CHANGED
|
@@ -4,10 +4,22 @@
|
|
|
4
4
|
|
|
5
5
|
__name__ = "biotite.sequence"
|
|
6
6
|
__author__ = "Patrick Kunzmann", "Thomas Nevolianis"
|
|
7
|
-
__all__ = [
|
|
8
|
-
|
|
7
|
+
__all__ = [
|
|
8
|
+
"GeneralSequence",
|
|
9
|
+
"NucleotideSequence",
|
|
10
|
+
"ProteinSequence",
|
|
11
|
+
"PositionalSequence",
|
|
12
|
+
"PurePositionalSequence",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass, field
|
|
9
16
|
import numpy as np
|
|
10
|
-
from biotite.sequence.alphabet import
|
|
17
|
+
from biotite.sequence.alphabet import (
|
|
18
|
+
Alphabet,
|
|
19
|
+
AlphabetError,
|
|
20
|
+
AlphabetMapper,
|
|
21
|
+
LetterAlphabet,
|
|
22
|
+
)
|
|
11
23
|
from biotite.sequence.sequence import Sequence
|
|
12
24
|
|
|
13
25
|
|
|
@@ -590,3 +602,112 @@ class ProteinSequence(Sequence):
|
|
|
590
602
|
"Sequence contains ambiguous amino acids, " "cannot calculate weight"
|
|
591
603
|
)
|
|
592
604
|
return weight
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
class PositionalSequence(Sequence):
|
|
608
|
+
"""
|
|
609
|
+
A sequence where each symbol is associated with a position.
|
|
610
|
+
|
|
611
|
+
For each individual position the sequence contains a separate
|
|
612
|
+
:class:`PositionalSequence.Symbol`, encoded by a custom alphabet for this sequence.
|
|
613
|
+
In consequence the symbol code is the position in the sequence itself.
|
|
614
|
+
This is useful for aligning sequences based on a position-specific
|
|
615
|
+
substitution matrix.
|
|
616
|
+
|
|
617
|
+
Parameters
|
|
618
|
+
----------
|
|
619
|
+
original_sequence : seq.Sequence
|
|
620
|
+
The original sequence to create the positional sequence from.
|
|
621
|
+
"""
|
|
622
|
+
|
|
623
|
+
@dataclass(frozen=True)
|
|
624
|
+
class Symbol:
|
|
625
|
+
"""
|
|
626
|
+
Combination of a symbol and its position in a sequence.
|
|
627
|
+
|
|
628
|
+
Attributes
|
|
629
|
+
----------
|
|
630
|
+
original_alphabet : Alphabet
|
|
631
|
+
The original alphabet, where the symbol stems from.
|
|
632
|
+
original_code : int
|
|
633
|
+
The code of the original symbol in the original alphabet.
|
|
634
|
+
position : int
|
|
635
|
+
The 0-based position of the symbol in the sequence.
|
|
636
|
+
symbol : object
|
|
637
|
+
The symbol from the original alphabet.
|
|
638
|
+
|
|
639
|
+
See Also
|
|
640
|
+
--------
|
|
641
|
+
PositionalSequence
|
|
642
|
+
The sequence type containing :class:`PositionalSequence.Symbol` objects.
|
|
643
|
+
"""
|
|
644
|
+
|
|
645
|
+
original_alphabet: ...
|
|
646
|
+
original_code: ...
|
|
647
|
+
position: ...
|
|
648
|
+
symbol: ... = field(init=False)
|
|
649
|
+
|
|
650
|
+
def __post_init__(self):
|
|
651
|
+
sym = self.original_alphabet.decode(self.original_code)
|
|
652
|
+
super().__setattr__("symbol", sym)
|
|
653
|
+
|
|
654
|
+
def __str__(self):
|
|
655
|
+
return str(self.symbol)
|
|
656
|
+
|
|
657
|
+
def __init__(self, original_sequence):
|
|
658
|
+
self._orig_alphabet = original_sequence.get_alphabet()
|
|
659
|
+
self._alphabet = Alphabet(
|
|
660
|
+
[
|
|
661
|
+
PositionalSequence.Symbol(self._orig_alphabet, code, pos)
|
|
662
|
+
for pos, code in enumerate(original_sequence.code)
|
|
663
|
+
]
|
|
664
|
+
)
|
|
665
|
+
self.code = np.arange(
|
|
666
|
+
len(original_sequence), dtype=Sequence.dtype(len(self._alphabet))
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
def reconstruct(self):
|
|
670
|
+
"""
|
|
671
|
+
Reconstruct the original sequence from the positional sequence.
|
|
672
|
+
|
|
673
|
+
Returns
|
|
674
|
+
-------
|
|
675
|
+
original_sequence : GeneralSequence
|
|
676
|
+
The original sequence.
|
|
677
|
+
Although the actual type of the returned sequence is always a
|
|
678
|
+
:class:`GeneralSequence`, the alphabet and the symbols of the returned
|
|
679
|
+
sequence are equal to the original sequence.
|
|
680
|
+
"""
|
|
681
|
+
original_sequence = GeneralSequence(self._orig_alphabet)
|
|
682
|
+
original_sequence.code = np.array([sym.original_code for sym in self._alphabet])
|
|
683
|
+
return original_sequence
|
|
684
|
+
|
|
685
|
+
def get_alphabet(self):
|
|
686
|
+
return self._alphabet
|
|
687
|
+
|
|
688
|
+
def __str__(self) -> str:
|
|
689
|
+
return "".join([str(sym) for sym in self.symbols])
|
|
690
|
+
|
|
691
|
+
def __repr__(self):
|
|
692
|
+
return f"PositionalSequence({self.reconstruct()!r})"
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
class PurePositionalSequence(Sequence):
|
|
696
|
+
"""
|
|
697
|
+
An object of this class is a 'placeholder' sequence, where each symbol is the
|
|
698
|
+
position in the sequence itself.
|
|
699
|
+
|
|
700
|
+
This class is similar to :class:`PositionalSequence`, but the symbols are not
|
|
701
|
+
derived from an original sequence, but are the pure position.
|
|
702
|
+
Hence, there is no meaningful string representation of the sequence and its symbols.
|
|
703
|
+
"""
|
|
704
|
+
|
|
705
|
+
def __init__(self, length):
|
|
706
|
+
self._alphabet = Alphabet(range(length))
|
|
707
|
+
self.code = np.arange(length, dtype=Sequence.dtype(length))
|
|
708
|
+
|
|
709
|
+
def get_alphabet(self):
|
|
710
|
+
return self._alphabet
|
|
711
|
+
|
|
712
|
+
def __repr__(self):
|
|
713
|
+
return f"PurePositionalSequence({len(self)})"
|
biotite/setup_ccd.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
__author__ = "Patrick Kunzmann"
|
|
2
|
+
__all__ = []
|
|
3
|
+
|
|
4
|
+
import gzip
|
|
5
|
+
import logging
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from io import StringIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import numpy as np
|
|
10
|
+
import requests
|
|
11
|
+
from biotite.structure.io.pdbx import *
|
|
12
|
+
|
|
13
|
+
OUTPUT_CCD = Path(__file__).parent / "structure" / "info" / "components.bcif"
|
|
14
|
+
CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def concatenate_ccd(categories=None):
|
|
18
|
+
"""
|
|
19
|
+
Create the CCD in BinaryCIF format, where each category contains the
|
|
20
|
+
data of all blocks.
|
|
21
|
+
|
|
22
|
+
Parameters
|
|
23
|
+
----------
|
|
24
|
+
categories : list of str, optional
|
|
25
|
+
The names of the categories to include.
|
|
26
|
+
By default, all categories from the CCD are included.
|
|
27
|
+
|
|
28
|
+
Returns
|
|
29
|
+
-------
|
|
30
|
+
compressed_file : BinaryCIFFile
|
|
31
|
+
The compressed CCD in BinaryCIF format.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
logging.info("Download and read CCD...")
|
|
35
|
+
ccd_cif_text = gzip.decompress(requests.get(CCD_URL).content).decode()
|
|
36
|
+
ccd_file = CIFFile.read(StringIO(ccd_cif_text))
|
|
37
|
+
|
|
38
|
+
compressed_block = BinaryCIFBlock()
|
|
39
|
+
if categories is None:
|
|
40
|
+
categories = _list_all_category_names(ccd_file)
|
|
41
|
+
for category_name in categories:
|
|
42
|
+
logging.info(f"Concatenate and compress '{category_name}' category...")
|
|
43
|
+
compressed_block[category_name] = compress(
|
|
44
|
+
_concatenate_blocks_into_category(ccd_file, category_name)
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
logging.info("Write concatenated CCD into BinaryCIF...")
|
|
48
|
+
compressed_file = BinaryCIFFile()
|
|
49
|
+
compressed_file["components"] = compressed_block
|
|
50
|
+
return compressed_file
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _concatenate_blocks_into_category(pdbx_file, category_name):
|
|
54
|
+
"""
|
|
55
|
+
Concatenate the given category from all blocks into a single
|
|
56
|
+
category.
|
|
57
|
+
|
|
58
|
+
Parameters
|
|
59
|
+
----------
|
|
60
|
+
pdbx_file : CIFFile
|
|
61
|
+
The PDBx file, whose blocks should be concatenated.
|
|
62
|
+
category_name : str
|
|
63
|
+
The name of the category to concatenate.
|
|
64
|
+
|
|
65
|
+
Returns
|
|
66
|
+
-------
|
|
67
|
+
category : BinaryCIFCategory
|
|
68
|
+
The concatenated category.
|
|
69
|
+
"""
|
|
70
|
+
columns_names = _list_all_column_names(pdbx_file, category_name)
|
|
71
|
+
data_chunks = defaultdict(list)
|
|
72
|
+
mask_chunks = defaultdict(list)
|
|
73
|
+
for block in pdbx_file.values():
|
|
74
|
+
if category_name not in block:
|
|
75
|
+
continue
|
|
76
|
+
category = block[category_name]
|
|
77
|
+
for column_name in columns_names:
|
|
78
|
+
if column_name in category:
|
|
79
|
+
column = category[column_name]
|
|
80
|
+
data_chunks[column_name].append(column.data.array)
|
|
81
|
+
if column.mask is not None:
|
|
82
|
+
mask_chunks[column_name].append(column.mask.array)
|
|
83
|
+
else:
|
|
84
|
+
mask_chunks[column_name].append(
|
|
85
|
+
np.full(category.row_count, MaskValue.PRESENT, dtype=np.uint8)
|
|
86
|
+
)
|
|
87
|
+
else:
|
|
88
|
+
# Column is missing in this block
|
|
89
|
+
# -> handle it as data masked as 'missing'
|
|
90
|
+
data_chunks[column_name].append(
|
|
91
|
+
# For now all arrays are of type string anyway,
|
|
92
|
+
# as they are read from a CIF file
|
|
93
|
+
np.full(category.row_count, "", dtype="U1")
|
|
94
|
+
)
|
|
95
|
+
mask_chunks[column_name].append(
|
|
96
|
+
np.full(category.row_count, MaskValue.MISSING, dtype=np.uint8)
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
bcif_columns = {}
|
|
100
|
+
for col_name in columns_names:
|
|
101
|
+
data = np.concatenate(data_chunks[col_name])
|
|
102
|
+
mask = np.concatenate(mask_chunks[col_name])
|
|
103
|
+
data = _into_fitting_type(data, mask)
|
|
104
|
+
if np.all(mask == MaskValue.PRESENT):
|
|
105
|
+
mask = None
|
|
106
|
+
bcif_columns[col_name] = BinaryCIFColumn(data, mask)
|
|
107
|
+
return BinaryCIFCategory(bcif_columns)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _list_all_column_names(pdbx_file, category_name):
|
|
111
|
+
"""
|
|
112
|
+
Get all columns that exist in any block for a given category.
|
|
113
|
+
|
|
114
|
+
Parameters
|
|
115
|
+
----------
|
|
116
|
+
pdbx_file : CIFFile
|
|
117
|
+
The PDBx file to search in for the columns.
|
|
118
|
+
category_name : str
|
|
119
|
+
The name of the category to search in.
|
|
120
|
+
|
|
121
|
+
Returns
|
|
122
|
+
-------
|
|
123
|
+
columns_names : list of str
|
|
124
|
+
The names of the columns.
|
|
125
|
+
"""
|
|
126
|
+
columns_names = set()
|
|
127
|
+
for block in pdbx_file.values():
|
|
128
|
+
if category_name in block:
|
|
129
|
+
columns_names.update(block[category_name].keys())
|
|
130
|
+
return sorted(columns_names)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _list_all_category_names(pdbx_file):
|
|
134
|
+
"""
|
|
135
|
+
Get all categories that exist in any block.
|
|
136
|
+
|
|
137
|
+
Parameters
|
|
138
|
+
----------
|
|
139
|
+
pdbx_file : CIFFile
|
|
140
|
+
The PDBx file to search in for the categories.
|
|
141
|
+
|
|
142
|
+
Returns
|
|
143
|
+
-------
|
|
144
|
+
category_names : list of str
|
|
145
|
+
The names of the categories.
|
|
146
|
+
"""
|
|
147
|
+
category_names = set()
|
|
148
|
+
for block in pdbx_file.values():
|
|
149
|
+
category_names.update(block.keys())
|
|
150
|
+
return sorted(category_names)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _into_fitting_type(string_array, mask):
|
|
154
|
+
"""
|
|
155
|
+
Try to find a numeric type for a string ndarray, if possible.
|
|
156
|
+
|
|
157
|
+
Parameters
|
|
158
|
+
----------
|
|
159
|
+
string_array : ndarray, dtype=string
|
|
160
|
+
The array to convert.
|
|
161
|
+
mask : ndarray, dtype=uint8
|
|
162
|
+
Only values in `string_array` where the mask is ``MaskValue.PRESENT`` are
|
|
163
|
+
considered for type conversion.
|
|
164
|
+
|
|
165
|
+
Returns
|
|
166
|
+
-------
|
|
167
|
+
array : ndarray
|
|
168
|
+
The array converted into an appropriate dtype.
|
|
169
|
+
"""
|
|
170
|
+
mask = mask == MaskValue.PRESENT
|
|
171
|
+
# Only try to find an appropriate dtype for unmasked values
|
|
172
|
+
values = string_array[mask]
|
|
173
|
+
try:
|
|
174
|
+
# Try to fit into integer type
|
|
175
|
+
values = values.astype(int)
|
|
176
|
+
except ValueError:
|
|
177
|
+
try:
|
|
178
|
+
# Try to fit into float type
|
|
179
|
+
values = values.astype(float)
|
|
180
|
+
except ValueError:
|
|
181
|
+
# Keep string type
|
|
182
|
+
pass
|
|
183
|
+
array = np.zeros(string_array.shape, dtype=values.dtype)
|
|
184
|
+
array[mask] = values
|
|
185
|
+
return array
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def main():
|
|
189
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
|
|
190
|
+
OUTPUT_CCD.parent.mkdir(parents=True, exist_ok=True)
|
|
191
|
+
|
|
192
|
+
compressed_ccd = concatenate_ccd(["chem_comp", "chem_comp_atom", "chem_comp_bond"])
|
|
193
|
+
compressed_ccd.write(OUTPUT_CCD)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
if __name__ == "__main__":
|
|
197
|
+
main()
|
biotite/structure/__init__.py
CHANGED
|
@@ -57,14 +57,15 @@ The annotation arrays can be accessed either via the method
|
|
|
57
57
|
The following annotation categories are optionally used by some
|
|
58
58
|
functions:
|
|
59
59
|
|
|
60
|
-
========= =========== =================
|
|
60
|
+
========= =========== ================= =========================================
|
|
61
61
|
Category Type Examples Description
|
|
62
|
-
========= =========== =================
|
|
62
|
+
========= =========== ================= =========================================
|
|
63
63
|
atom_id int 1,2,3, ... Atom serial number
|
|
64
64
|
b_factor float 0.9, 12.3, ... Temperature factor
|
|
65
65
|
occupancy float .1, .3, .9, ... Occupancy
|
|
66
66
|
charge int -2,-1,0,1,2, ... Electric charge of the atom
|
|
67
|
-
|
|
67
|
+
sym_id string '1','2','3', ... Symmetry ID for assemblies/symmetry mates
|
|
68
|
+
========= =========== ================= =========================================
|
|
68
69
|
|
|
69
70
|
For each type, the attributes can be accessed directly.
|
|
70
71
|
Both :class:`AtomArray` and :class:`AtomArrayStack` support
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
A subpackage for converting structures to structural alphabet sequences.
|
|
7
|
+
|
|
8
|
+
Structural alphabets represent the local geometry of each residue in a structure as
|
|
9
|
+
a symbol in a sequence.
|
|
10
|
+
This allows using sequence-based functionality from :mod:`biotite.sequence` on
|
|
11
|
+
structural data.
|
|
12
|
+
|
|
13
|
+
For each supported structural alphabet, this subpackage provides a conversion function
|
|
14
|
+
that converts each chain of a given structure into a :class:`Sequence` object from the
|
|
15
|
+
respective structural alphabet.
|
|
16
|
+
|
|
17
|
+
Note that the structural alphabets use lower-case letters as symbols, in order to
|
|
18
|
+
distinguish them better from the nucleotide and amino acid alphabets.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
__name__ = "biotite.structure.alphabet"
|
|
22
|
+
__author__ = "Martin Larralde, Patrick Kunzmann"
|
|
23
|
+
|
|
24
|
+
from .i3d import *
|
|
25
|
+
from .pb import *
|