biotite 1.0.1__cp310-cp310-win_amd64.whl → 1.2.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (177) hide show
  1. biotite/application/application.py +3 -3
  2. biotite/application/autodock/app.py +1 -1
  3. biotite/application/blast/webapp.py +1 -1
  4. biotite/application/clustalo/app.py +1 -1
  5. biotite/application/dssp/app.py +13 -3
  6. biotite/application/localapp.py +36 -2
  7. biotite/application/msaapp.py +10 -10
  8. biotite/application/muscle/app3.py +5 -18
  9. biotite/application/muscle/app5.py +5 -5
  10. biotite/application/sra/app.py +0 -5
  11. biotite/application/util.py +22 -2
  12. biotite/application/viennarna/rnaalifold.py +8 -8
  13. biotite/application/viennarna/rnaplot.py +9 -3
  14. biotite/application/viennarna/util.py +1 -1
  15. biotite/application/webapp.py +1 -1
  16. biotite/database/afdb/__init__.py +12 -0
  17. biotite/database/afdb/download.py +191 -0
  18. biotite/database/entrez/dbnames.py +10 -0
  19. biotite/database/entrez/download.py +9 -10
  20. biotite/database/entrez/key.py +1 -1
  21. biotite/database/entrez/query.py +5 -4
  22. biotite/database/pubchem/download.py +6 -6
  23. biotite/database/pubchem/error.py +10 -0
  24. biotite/database/pubchem/query.py +12 -23
  25. biotite/database/rcsb/download.py +3 -2
  26. biotite/database/rcsb/query.py +8 -9
  27. biotite/database/uniprot/check.py +22 -17
  28. biotite/database/uniprot/download.py +3 -6
  29. biotite/database/uniprot/query.py +4 -5
  30. biotite/file.py +14 -2
  31. biotite/interface/__init__.py +19 -0
  32. biotite/interface/openmm/__init__.py +16 -0
  33. biotite/interface/openmm/state.py +93 -0
  34. biotite/interface/openmm/system.py +227 -0
  35. biotite/interface/pymol/__init__.py +198 -0
  36. biotite/interface/pymol/cgo.py +346 -0
  37. biotite/interface/pymol/convert.py +185 -0
  38. biotite/interface/pymol/display.py +267 -0
  39. biotite/interface/pymol/object.py +1226 -0
  40. biotite/interface/pymol/shapes.py +178 -0
  41. biotite/interface/pymol/startup.py +169 -0
  42. biotite/interface/rdkit/__init__.py +15 -0
  43. biotite/interface/rdkit/mol.py +490 -0
  44. biotite/interface/version.py +71 -0
  45. biotite/interface/warning.py +19 -0
  46. biotite/sequence/align/__init__.py +0 -4
  47. biotite/sequence/align/alignment.py +49 -14
  48. biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
  49. biotite/sequence/align/banded.pyx +26 -26
  50. biotite/sequence/align/cigar.py +2 -2
  51. biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
  52. biotite/sequence/align/kmeralphabet.pyx +19 -2
  53. biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
  54. biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
  55. biotite/sequence/align/kmertable.pyx +58 -48
  56. biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
  57. biotite/sequence/align/localgapped.pyx +47 -47
  58. biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
  59. biotite/sequence/align/localungapped.pyx +10 -10
  60. biotite/sequence/align/matrix.py +284 -57
  61. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  62. biotite/sequence/align/matrix_data/PB.license +21 -0
  63. biotite/sequence/align/matrix_data/PB.mat +18 -0
  64. biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
  65. biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
  66. biotite/sequence/align/pairwise.pyx +35 -35
  67. biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
  68. biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
  69. biotite/sequence/align/selector.pyx +2 -2
  70. biotite/sequence/align/statistics.py +1 -1
  71. biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
  72. biotite/sequence/alphabet.py +5 -2
  73. biotite/sequence/annotation.py +19 -13
  74. biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
  75. biotite/sequence/codon.py +1 -2
  76. biotite/sequence/graphics/alignment.py +25 -39
  77. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  78. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  79. biotite/sequence/graphics/colorschemes.py +44 -11
  80. biotite/sequence/graphics/dendrogram.py +4 -2
  81. biotite/sequence/graphics/features.py +2 -2
  82. biotite/sequence/graphics/logo.py +10 -12
  83. biotite/sequence/io/fasta/convert.py +1 -2
  84. biotite/sequence/io/fasta/file.py +1 -1
  85. biotite/sequence/io/fastq/file.py +3 -3
  86. biotite/sequence/io/genbank/file.py +3 -3
  87. biotite/sequence/io/genbank/sequence.py +2 -0
  88. biotite/sequence/io/gff/convert.py +1 -1
  89. biotite/sequence/io/gff/file.py +1 -2
  90. biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
  91. biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
  92. biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
  93. biotite/sequence/profile.py +105 -29
  94. biotite/sequence/search.py +0 -1
  95. biotite/sequence/seqtypes.py +136 -8
  96. biotite/sequence/sequence.py +1 -2
  97. biotite/setup_ccd.py +197 -0
  98. biotite/structure/__init__.py +6 -3
  99. biotite/structure/alphabet/__init__.py +25 -0
  100. biotite/structure/alphabet/encoder.py +332 -0
  101. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  102. biotite/structure/alphabet/i3d.py +109 -0
  103. biotite/structure/alphabet/layers.py +86 -0
  104. biotite/structure/alphabet/pb.license +21 -0
  105. biotite/structure/alphabet/pb.py +170 -0
  106. biotite/structure/alphabet/unkerasify.py +128 -0
  107. biotite/structure/atoms.py +163 -66
  108. biotite/structure/basepairs.py +26 -26
  109. biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
  110. biotite/structure/bonds.pyx +79 -25
  111. biotite/structure/box.py +19 -21
  112. biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
  113. biotite/structure/celllist.pyx +83 -67
  114. biotite/structure/chains.py +5 -37
  115. biotite/structure/charges.cp310-win_amd64.pyd +0 -0
  116. biotite/structure/compare.py +420 -13
  117. biotite/structure/density.py +1 -1
  118. biotite/structure/dotbracket.py +27 -28
  119. biotite/structure/filter.py +8 -8
  120. biotite/structure/geometry.py +74 -127
  121. biotite/structure/hbond.py +17 -19
  122. biotite/structure/info/__init__.py +1 -0
  123. biotite/structure/info/atoms.py +24 -15
  124. biotite/structure/info/bonds.py +12 -6
  125. biotite/structure/info/ccd.py +125 -34
  126. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  127. biotite/structure/info/groups.py +62 -19
  128. biotite/structure/info/masses.py +9 -6
  129. biotite/structure/info/misc.py +15 -22
  130. biotite/structure/info/radii.py +92 -22
  131. biotite/structure/info/standardize.py +4 -4
  132. biotite/structure/integrity.py +4 -6
  133. biotite/structure/io/general.py +2 -2
  134. biotite/structure/io/gro/file.py +8 -9
  135. biotite/structure/io/mol/convert.py +1 -1
  136. biotite/structure/io/mol/ctab.py +33 -28
  137. biotite/structure/io/mol/mol.py +1 -1
  138. biotite/structure/io/mol/sdf.py +80 -53
  139. biotite/structure/io/pdb/convert.py +4 -3
  140. biotite/structure/io/pdb/file.py +85 -25
  141. biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
  142. biotite/structure/io/pdbqt/file.py +36 -36
  143. biotite/structure/io/pdbx/__init__.py +1 -0
  144. biotite/structure/io/pdbx/bcif.py +54 -15
  145. biotite/structure/io/pdbx/cif.py +92 -66
  146. biotite/structure/io/pdbx/component.py +15 -4
  147. biotite/structure/io/pdbx/compress.py +321 -0
  148. biotite/structure/io/pdbx/convert.py +410 -75
  149. biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
  150. biotite/structure/io/pdbx/encoding.pyx +98 -17
  151. biotite/structure/io/trajfile.py +9 -6
  152. biotite/structure/io/util.py +38 -0
  153. biotite/structure/mechanics.py +0 -1
  154. biotite/structure/molecules.py +141 -156
  155. biotite/structure/pseudoknots.py +7 -13
  156. biotite/structure/repair.py +2 -4
  157. biotite/structure/residues.py +13 -24
  158. biotite/structure/rings.py +335 -0
  159. biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
  160. biotite/structure/sasa.pyx +2 -1
  161. biotite/structure/segments.py +69 -11
  162. biotite/structure/sequence.py +0 -1
  163. biotite/structure/sse.py +0 -2
  164. biotite/structure/superimpose.py +74 -62
  165. biotite/structure/tm.py +581 -0
  166. biotite/structure/transform.py +12 -25
  167. biotite/structure/util.py +76 -4
  168. biotite/version.py +9 -4
  169. biotite/visualize.py +111 -1
  170. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
  171. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
  172. biotite/structure/info/ccd/README.rst +0 -8
  173. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  174. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  175. biotite/structure/info/ccd/nucleotides.txt +0 -798
  176. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
  177. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -16,7 +16,8 @@
16
16
  "m",
17
17
  "n",
18
18
  "o",
19
- "p"
19
+ "p",
20
+ "z"
20
21
  ],
21
22
  "colors": {
22
23
  "a": "#31b5fc",
@@ -94,27 +94,32 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"):
94
94
  >>> print(color_scheme)
95
95
  ['#3737f5', '#37f537', '#f5f537', '#f53737']
96
96
  """
97
+ # Try exact alphabet match first
98
+ for scheme in _color_schemes:
99
+ if scheme["name"] == name and scheme["alphabet"] == alphabet:
100
+ return _fit_color_scheme(alphabet, scheme, default)
101
+ # If no exact match was found, try to find a scheme for an alphabet
102
+ # that extends the given alphabet
97
103
  for scheme in _color_schemes:
98
104
  if scheme["name"] == name and scheme["alphabet"].extends(alphabet):
99
- colors = scheme["colors"]
100
- # Replace None values with default color
101
- colors = [color if color is not None else default for color in colors]
102
- # Only return colors that are in scope of this alphabet
103
- # and not the extended alphabet
104
- return colors[: len(alphabet)]
105
+ return _fit_color_scheme(alphabet, scheme, default)
106
+
105
107
  raise ValueError(f"Unkown scheme '{name}' for given alphabet")
106
108
 
107
109
 
108
- def list_color_scheme_names(alphabet):
110
+ def list_color_scheme_names(alphabet, strict=False):
109
111
  """
110
112
  Get a list of available color scheme names for a given alphabet.
111
113
 
112
114
  Parameters
113
115
  ----------
114
116
  alphabet : Alphabet
115
- The alphbet to get the color scheme names for.
116
- The alphabet of the scheme must equal or extend this parameter,
117
- to be included in the list.
117
+ The alphabet to get the color scheme names for.
118
+ strict : bool, optional
119
+ If set to true, only schemes with an exact match to the given
120
+ alphabet are included in the list.
121
+ If set to false, schemes with an alphabet that extends the given
122
+ alphabet are also included.
118
123
 
119
124
  Returns
120
125
  -------
@@ -123,7 +128,9 @@ def list_color_scheme_names(alphabet):
123
128
  """
124
129
  scheme_list = []
125
130
  for scheme in _color_schemes:
126
- if scheme["alphabet"].extends(alphabet):
131
+ if strict and scheme["alphabet"] == alphabet:
132
+ scheme_list.append(scheme["name"])
133
+ if not strict and scheme["alphabet"].extends(alphabet):
127
134
  scheme_list.append(scheme["name"])
128
135
  return scheme_list
129
136
 
@@ -135,3 +142,29 @@ _color_schemes = []
135
142
  for file_name in glob.glob(_scheme_dir + os.sep + "*.json"):
136
143
  scheme = load_color_scheme(file_name)
137
144
  _color_schemes.append(scheme)
145
+
146
+
147
+ def _fit_color_scheme(alphabet, color_scheme, default_color):
148
+ """
149
+ Fit a color scheme to the given alphabet.
150
+
151
+ Parameters
152
+ ----------
153
+ alphabet : Alphabet
154
+ The alphabet to get the color scheme for.
155
+ color_scheme : dict
156
+ The color scheme.
157
+ default_color : str or tuple
158
+ The default color.
159
+
160
+ Returns
161
+ -------
162
+ scheme : list of str
163
+ The colors from the scheme.
164
+ """
165
+ colors = color_scheme["colors"]
166
+ # Replace None values with default color
167
+ colors = [color if color is not None else default_color for color in colors]
168
+ # Only return colors that are in scope of this alphabet
169
+ # and not the extended alphabet
170
+ return colors[: len(alphabet)]
@@ -25,8 +25,10 @@ def plot_dendrogram(
25
25
 
26
26
  Parameters
27
27
  ----------
28
+ axes : Axes
29
+ A *Matplotlib* axes, that is used as plotting area.
28
30
  tree : Tree
29
- The tree to be visualized
31
+ The tree to be visualized.
30
32
  orientation : {'left', 'right', 'bottom', 'top'}, optional
31
33
  The position of the root node in the plot
32
34
  use_distances : bool, optional
@@ -38,7 +40,7 @@ def plot_dendrogram(
38
40
  The label of a leaf node is the entry at the position of its
39
41
  `index` attribute.
40
42
  label_size : float, optional
41
- The font size of the labels
43
+ The font size of the labels.
42
44
  color : tuple or str, optional
43
45
  A *Matplotlib* compatible color, that is used to draw the lines
44
46
  of the dendrogram.
@@ -71,7 +71,7 @@ def plot_feature_map(
71
71
  If true, the sequence position the base/residue of a line is
72
72
  shown on the right side of the plot.
73
73
  number_size : float, optional
74
- The font size of the position numbers
74
+ The font size of the position numbers.
75
75
  line_width : float, optional
76
76
  The size of the continuous line as fraction of the height of
77
77
  the drawn features.
@@ -416,7 +416,7 @@ class PromoterPlotter(FeaturePlotter):
416
416
  line_width : float, optional
417
417
  The width of the curved arrow tail.
418
418
  head_width : float, optional
419
- The width of the arrow head
419
+ The width of the arrow head.
420
420
  head_length : float, optional
421
421
  The length of the arrow.
422
422
  head_height : float, optional
@@ -9,7 +9,7 @@ __all__ = ["plot_sequence_logo"]
9
9
  import numpy as np
10
10
  from biotite.sequence.alphabet import LetterAlphabet
11
11
  from biotite.sequence.graphics.colorschemes import get_color_scheme
12
- from biotite.visualize import set_font_size_in_coord
12
+ from biotite.visualize import plot_scaled_text
13
13
 
14
14
 
15
15
  def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
@@ -29,7 +29,7 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
29
29
  ----------
30
30
  axes : Axes
31
31
  The axes to draw the logo one.
32
- profile: SequenceProfile
32
+ profile : SequenceProfile
33
33
  The logo is created based on this profile.
34
34
  scheme : str or list of (tuple or str)
35
35
  Either a valid color scheme name
@@ -38,7 +38,8 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
38
38
  The list length must be at least as long as the
39
39
  length of the alphabet used by the `profile`.
40
40
  **kwargs
41
- Additional `text parameters <https://matplotlib.org/api/text_api.html#matplotlib.text.Text>`_.
41
+ Additional parameters for the :class:`matplotlib.font_manager.FontProperties`
42
+ of the text or the created :class:`matplotlib.patches.PathPatch`.
42
43
 
43
44
  References
44
45
  ----------
@@ -69,23 +70,20 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs):
69
70
  index_order = np.argsort(symbols_heights)
70
71
  start_height = 0
71
72
  for j in index_order[i]:
72
- # Stack the symbols at position on top of the preceeding one
73
+ # Stack the symbols at position on top of the preceding one
73
74
  height = symbols_heights[i, j]
74
75
  if height > 0:
75
76
  symbol = alphabet.decode(j)
76
- text = axes.text(
77
+ plot_scaled_text(
78
+ axes,
79
+ symbol,
77
80
  i + 0.5,
78
81
  start_height,
79
- symbol,
80
- ha="left",
81
- va="bottom",
82
+ width=1,
83
+ height=height,
82
84
  color=colors[j],
83
- # Best results are obtained with this font size
84
- size=1,
85
85
  **kwargs,
86
86
  )
87
- text.set_clip_on(True)
88
- set_font_size_in_coord(text, width=1, height=height)
89
87
  start_height += height
90
88
 
91
89
  axes.set_xlim(0.5, len(profile.symbols) + 0.5)
@@ -275,8 +275,7 @@ def _process_nucleotide_sequence(x):
275
275
  def _convert_to_string(sequence, as_rna):
276
276
  if not isinstance(sequence.get_alphabet(), LetterAlphabet):
277
277
  raise ValueError(
278
- "Only sequences using single letter alphabets "
279
- "can be stored in a FASTA file"
278
+ "Only sequences using single letter alphabets can be stored in a FASTA file"
280
279
  )
281
280
  if isinstance(sequence, NucleotideSequence) and as_rna:
282
281
  return str(sequence).replace("T", "U")
@@ -102,7 +102,7 @@ class FastaFile(TextFile, MutableMapping):
102
102
  if not isinstance(header, str):
103
103
  raise IndexError("'FastaFile' only supports header strings as keys")
104
104
  if not isinstance(seq_str, str):
105
- raise TypeError("'FastaFile' only supports sequence strings " "as values")
105
+ raise TypeError("'FastaFile' only supports sequence strings as values")
106
106
  # Create lines for new header and sequence (with line breaks)
107
107
  new_lines = [">" + header.replace("\n", "").strip()] + wrap_string(
108
108
  seq_str, width=self._chars_per_line
@@ -302,10 +302,10 @@ class FastqFile(TextFile, MutableMapping):
302
302
  else: # score_len > seq_len
303
303
  raise InvalidFileError(
304
304
  f"The amount of scores is not equal to the sequence "
305
- f"length for the sequence in line {seq_start_i+1} "
305
+ f"length for the sequence in line {seq_start_i + 1} "
306
306
  )
307
307
  else:
308
- raise InvalidFileError(f"Line {i+1} in FASTQ file is invalid")
308
+ raise InvalidFileError(f"Line {i + 1} in FASTQ file is invalid")
309
309
  # At the end of the file, the last sequence or score block
310
310
  # must have properly ended
311
311
  if in_sequence or in_scores:
@@ -392,7 +392,7 @@ class FastqFile(TextFile, MutableMapping):
392
392
  yield identifier, ("".join(seq_str_list), scores)
393
393
  else: # score_len > seq_len
394
394
  raise InvalidFileError(
395
- "The amount of scores is not equal to the sequence " "length"
395
+ "The amount of scores is not equal to the sequence length"
396
396
  )
397
397
 
398
398
  else:
@@ -80,7 +80,7 @@ class GenBankFile(TextFile):
80
80
  >>> print(content)
81
81
  ['One line', 'A second line']
82
82
  >>> print(subfields)
83
- OrderedDict([('SUBFIELD1', ['Single Line']), ('SUBFIELD2', ['Two', 'lines'])])
83
+ OrderedDict({'SUBFIELD1': ['Single Line'], 'SUBFIELD2': ['Two', 'lines']})
84
84
 
85
85
  Adding an additional field:
86
86
 
@@ -391,7 +391,7 @@ class GenBankFile(TextFile):
391
391
  The field name.
392
392
  content : list of str
393
393
  The content lines.
394
- subfield_dict : dict of str -> str, optional
394
+ subfields : dict of str -> str, optional
395
395
  The subfields of the field.
396
396
  The dictionary maps subfield names to the content lines of
397
397
  the respective subfield.
@@ -432,7 +432,7 @@ class GenBankFile(TextFile):
432
432
  The field name.
433
433
  content : list of str
434
434
  The content lines.
435
- subfield_dict : dict of str -> str, optional
435
+ subfields : dict of str -> str, optional
436
436
  The subfields of the field.
437
437
  The dictionary maps subfield names to the content lines of
438
438
  the respective subfield.
@@ -82,6 +82,8 @@ def get_annotated_sequence(gb_file, format="gb", include_only=None):
82
82
  ----------
83
83
  gb_file : GenBankFile
84
84
  The GenBank file to read the fields from.
85
+ format : {'gb', 'gp'}
86
+ Whether the file is a *GenBank* or *GenPept* file.
85
87
  include_only : iterable object of str, optional
86
88
  List of names of feature keys, which should included
87
89
  in the annotation. By default all features are included.
@@ -84,7 +84,7 @@ def set_annotation(gff_file, annotation, seqid=None, source=None, is_stranded=Tr
84
84
  for feature in sorted(annotation):
85
85
  if len(feature.locs) > 1 and "ID" not in feature.qual:
86
86
  raise ValueError(
87
- "The 'Id' qualifier is required " "for features with multiple locations"
87
+ "The 'Id' qualifier is required for features with multiple locations"
88
88
  )
89
89
  ## seqid ##
90
90
  if seqid is not None and " " in seqid:
@@ -303,8 +303,7 @@ class GFFFile(TextFile):
303
303
  def __getitem__(self, index):
304
304
  if (index >= 0 and index >= len(self)) or (index < 0 and -index > len(self)):
305
305
  raise IndexError(
306
- f"Index {index} is out of range for GFFFile with "
307
- f"{len(self)} entries"
306
+ f"Index {index} is out of range for GFFFile with {len(self)} entries"
308
307
  )
309
308
 
310
309
  line_index = self._entries[index]
@@ -3,6 +3,7 @@
3
3
  # information.
4
4
 
5
5
  import warnings
6
+ from numbers import Integral
6
7
  import numpy as np
7
8
  from biotite.sequence.align.alignment import get_codes
8
9
  from biotite.sequence.alphabet import LetterAlphabet
@@ -66,6 +67,9 @@ class SequenceProfile(object):
66
67
  It also saves the number of gaps at each position in the array
67
68
  'gaps'.
68
69
 
70
+ With :meth:`from_alignment()` a :class:`SequenceProfile` object can
71
+ be created from an indefinite number of aligned sequences.
72
+
69
73
  With :meth:`probability_matrix()` the position probability matrix
70
74
  can be created based on 'symbols' and a pseudocount.
71
75
 
@@ -73,9 +77,6 @@ class SequenceProfile(object):
73
77
  be created based on the before calculated position probability
74
78
  matrix and the background frequencies.
75
79
 
76
- With :meth:`from_alignment()` a :class:`SequenceProfile` object can
77
- be created from an indefinite number of aligned sequences.
78
-
79
80
  With :meth:`sequence_probability_from_matrix()` the probability of a
80
81
  sequence can be calculated based on the before calculated position
81
82
  probability matrix of this instance of object SequenceProfile.
@@ -94,7 +95,7 @@ class SequenceProfile(object):
94
95
  gaps : ndarray, dtype=int, shape=n
95
96
  Array which indicates the number of gaps at each position.
96
97
  alphabet : Alphabet, length=k
97
- Alphabet of sequences of sequence profile
98
+ Alphabet of sequences of sequence profile.
98
99
 
99
100
  Attributes
100
101
  ----------
@@ -105,6 +106,63 @@ class SequenceProfile(object):
105
106
  Array which indicates the number of gaps at each position.
106
107
  alphabet : Alphabet, length=k
107
108
  Alphabet of sequences of sequence profile
109
+
110
+ Examples
111
+ --------
112
+
113
+ Create a profile from a multiple sequence alignment:
114
+
115
+ >>> sequences = [
116
+ ... NucleotideSequence("CGCTCATTC"),
117
+ ... NucleotideSequence("CGCTATTC"),
118
+ ... NucleotideSequence("CCCTCAATC"),
119
+ ... ]
120
+ >>> msa, _, _, _ = align_multiple(
121
+ ... sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
122
+ ... )
123
+ >>> print(msa)
124
+ CGCTCATTC
125
+ CGCT-ATTC
126
+ CCCTCAATC
127
+ >>> profile = SequenceProfile.from_alignment(msa)
128
+ >>> print(profile)
129
+ A C G T
130
+ 0 0 3 0 0
131
+ 1 0 1 2 0
132
+ 2 0 3 0 0
133
+ 3 0 0 0 3
134
+ 4 0 2 0 0
135
+ 5 3 0 0 0
136
+ 6 1 0 0 2
137
+ 7 0 0 0 3
138
+ 8 0 3 0 0
139
+ >>> print(profile.gaps)
140
+ [0 0 0 0 1 0 0 0 0]
141
+
142
+ Slice the profile (masks and index arrays are also supported):
143
+
144
+ >>> print(profile[2:])
145
+ A C G T
146
+ 0 0 3 0 0
147
+ 1 0 0 0 3
148
+ 2 0 2 0 0
149
+ 3 3 0 0 0
150
+ 4 1 0 0 2
151
+ 5 0 0 0 3
152
+ 6 0 3 0 0
153
+
154
+ Use the profile to compute the position probability matrix:
155
+
156
+ >>> print(profile.probability_matrix())
157
+ [[0.000 1.000 0.000 0.000]
158
+ [0.000 0.333 0.667 0.000]
159
+ [0.000 1.000 0.000 0.000]
160
+ [0.000 0.000 0.000 1.000]
161
+ [0.000 1.000 0.000 0.000]
162
+ [1.000 0.000 0.000 0.000]
163
+ [0.333 0.000 0.000 0.667]
164
+ [0.000 0.000 0.000 1.000]
165
+ [0.000 1.000 0.000 0.000]]
108
166
  """
109
167
 
110
168
  def __init__(self, symbols, gaps, alphabet):
@@ -156,8 +214,23 @@ class SequenceProfile(object):
156
214
  )
157
215
  self._gaps = new_gaps
158
216
 
217
+ def __str__(self):
218
+ # Add an additional row and column for the position and symbol indicators
219
+ print_matrix = np.full(
220
+ (self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
221
+ )
222
+ print_matrix[1:, 1:] = self.symbols.astype(str)
223
+ print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
224
+ print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
225
+ max_len = len(max(print_matrix.flatten(), key=len))
226
+ return "\n".join(
227
+ [
228
+ " ".join([str(cell).rjust(max_len) for cell in row])
229
+ for row in print_matrix
230
+ ]
231
+ )
232
+
159
233
  def __repr__(self):
160
- """Represent SequenceProfile as a string for debugging."""
161
234
  return (
162
235
  f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
163
236
  f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
@@ -191,15 +264,14 @@ class SequenceProfile(object):
191
264
  alphabet : bool
192
265
  This alphabet will be used when creating the SequenceProfile
193
266
  object. If no alphabet is selected, the alphabet for this
194
- SequenceProfile
267
+ :class:`SequenceProfile`.
195
268
  object will be calculated from the sequences of object
196
269
  Alignment.
197
- (Default: None).
198
270
 
199
271
  Returns
200
272
  -------
201
273
  profile: SequenceProfile
202
- The created SequenceProfile object
274
+ The created :class:`SequenceProfile` object.
203
275
  """
204
276
  sequences = get_codes(alignment)
205
277
  if alphabet is None:
@@ -233,13 +305,12 @@ class SequenceProfile(object):
233
305
  If true, returns consensus sequence as GeneralSequence
234
306
  object.
235
307
  Otherwise, the consensus sequence object type is chosen
236
- based on the alphabet of this SequenceProfile object
237
- (Default: False).
308
+ based on the alphabet of this SequenceProfile object.
238
309
 
239
310
  Returns
240
311
  -------
241
312
  consensus: Sequence
242
- The calculated consensus sequence
313
+ The calculated consensus sequence.
243
314
  """
244
315
  # https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes
245
316
  if as_general:
@@ -347,14 +418,13 @@ class SequenceProfile(object):
347
418
 
348
419
  Parameters
349
420
  ----------
350
- pseudocount: int, optional
421
+ pseudocount : int, optional
351
422
  Amount added to the number of observed cases in order to
352
423
  change the expected probability of the PPM.
353
- (Default: 0)
354
424
 
355
425
  Returns
356
426
  -------
357
- ppm: ndarray, dtype=float, shape=(n,k)
427
+ ppm : ndarray, dtype=float, shape=(n,k)
358
428
  The calculated the position probability matrix.
359
429
  """
360
430
  if pseudocount < 0:
@@ -383,17 +453,16 @@ class SequenceProfile(object):
383
453
 
384
454
  Parameters
385
455
  ----------
386
- pseudocount: int, optional
387
- Amount added to the number of observed cases in order to change
388
- the expected probability of the PPM.
389
- (Default: 0)
390
- background_frequencies: ndarray, shape=(k,), dtype=float, optional
456
+ background_frequencies : ndarray, shape=(k,), dtype=float, optional
391
457
  The background frequencies for each symbol in the alphabet.
392
458
  By default, a uniform distribution is assumed.
459
+ pseudocount : int, optional
460
+ Amount added to the number of observed cases in order to change
461
+ the expected probability of the PPM.
393
462
 
394
463
  Returns
395
464
  -------
396
- pwm: ndarray, dtype=float, shape=(n,k)
465
+ pwm : ndarray, dtype=float, shape=(n,k)
397
466
  The calculated the position weight matrix.
398
467
  """
399
468
  if background_frequencies is None:
@@ -417,14 +486,13 @@ class SequenceProfile(object):
417
486
  ----------
418
487
  sequence : Sequence
419
488
  The input sequence.
420
- pseudocount: int, optional
489
+ pseudocount : int, optional
421
490
  Amount added to the number of observed cases in order to change
422
491
  the expected probability of the PPM.
423
- (Default: 0)
424
492
 
425
493
  Returns
426
494
  -------
427
- probability: float
495
+ probability : float
428
496
  The calculated probability for the input sequence based on
429
497
  the PPM.
430
498
  """
@@ -453,17 +521,16 @@ class SequenceProfile(object):
453
521
  ----------
454
522
  sequence : Sequence
455
523
  The input sequence.
456
- pseudocount: int, optional
457
- Amount added to the number of observed cases in order to change
458
- the expected probability of the PPM.
459
- (Default: 0)
460
- background_frequencies: ndarray, shape=(k,), dtype=float, optional
524
+ background_frequencies : ndarray, shape=(k,), dtype=float, optional
461
525
  The background frequencies for each symbol in the alphabet.
462
526
  By default a uniform distribution is assumed.
527
+ pseudocount : int, optional
528
+ Amount added to the number of observed cases in order to change
529
+ the expected probability of the PPM.
463
530
 
464
531
  Returns
465
532
  -------
466
- score: float
533
+ score : float
467
534
  The calculated score for the input sequence based on
468
535
  the PWM.
469
536
  """
@@ -483,3 +550,12 @@ class SequenceProfile(object):
483
550
  f"as 'symbols' {self.symbols.shape}"
484
551
  )
485
552
  return np.sum(pwm[np.arange(len(sequence)), sequence.code])
553
+
554
+ def __getitem__(self, index):
555
+ if isinstance(index, Integral):
556
+ # Do not allow to collapse dimensions
557
+ index = slice(index, index + 1)
558
+ return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
559
+
560
+ def __len__(self):
561
+ return len(self.symbols)
@@ -39,7 +39,6 @@ def find_subsequence(sequence, query):
39
39
  >>> sub_seq = NucleotideSequence("TGA")
40
40
  >>> print(find_subsequence(main_seq, sub_seq))
41
41
  [2 6]
42
-
43
42
  """
44
43
  if not sequence.get_alphabet().extends(query.get_alphabet()):
45
44
  raise ValueError("The sequences alphabets are not equal")