biotite 0.41.2__cp310-cp310-win_amd64.whl → 1.0.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (205) hide show
  1. biotite/__init__.py +2 -3
  2. biotite/application/__init__.py +1 -1
  3. biotite/application/application.py +20 -10
  4. biotite/application/autodock/__init__.py +1 -1
  5. biotite/application/autodock/app.py +74 -79
  6. biotite/application/blast/__init__.py +1 -1
  7. biotite/application/blast/alignment.py +19 -10
  8. biotite/application/blast/webapp.py +92 -85
  9. biotite/application/clustalo/__init__.py +1 -1
  10. biotite/application/clustalo/app.py +46 -61
  11. biotite/application/dssp/__init__.py +1 -1
  12. biotite/application/dssp/app.py +8 -11
  13. biotite/application/localapp.py +62 -60
  14. biotite/application/mafft/__init__.py +1 -1
  15. biotite/application/mafft/app.py +16 -22
  16. biotite/application/msaapp.py +78 -89
  17. biotite/application/muscle/__init__.py +1 -1
  18. biotite/application/muscle/app3.py +50 -64
  19. biotite/application/muscle/app5.py +23 -31
  20. biotite/application/sra/__init__.py +1 -1
  21. biotite/application/sra/app.py +64 -68
  22. biotite/application/tantan/__init__.py +1 -1
  23. biotite/application/tantan/app.py +22 -45
  24. biotite/application/util.py +7 -9
  25. biotite/application/viennarna/rnaalifold.py +34 -28
  26. biotite/application/viennarna/rnafold.py +24 -39
  27. biotite/application/viennarna/rnaplot.py +36 -21
  28. biotite/application/viennarna/util.py +17 -12
  29. biotite/application/webapp.py +13 -14
  30. biotite/copyable.py +13 -13
  31. biotite/database/__init__.py +1 -1
  32. biotite/database/entrez/__init__.py +1 -1
  33. biotite/database/entrez/check.py +2 -3
  34. biotite/database/entrez/dbnames.py +7 -5
  35. biotite/database/entrez/download.py +55 -49
  36. biotite/database/entrez/key.py +1 -1
  37. biotite/database/entrez/query.py +62 -23
  38. biotite/database/error.py +2 -1
  39. biotite/database/pubchem/__init__.py +1 -1
  40. biotite/database/pubchem/download.py +43 -45
  41. biotite/database/pubchem/error.py +2 -2
  42. biotite/database/pubchem/query.py +34 -31
  43. biotite/database/pubchem/throttle.py +3 -4
  44. biotite/database/rcsb/__init__.py +1 -1
  45. biotite/database/rcsb/download.py +44 -52
  46. biotite/database/rcsb/query.py +85 -80
  47. biotite/database/uniprot/check.py +6 -3
  48. biotite/database/uniprot/download.py +6 -11
  49. biotite/database/uniprot/query.py +115 -31
  50. biotite/file.py +12 -31
  51. biotite/sequence/__init__.py +3 -3
  52. biotite/sequence/align/__init__.py +2 -2
  53. biotite/sequence/align/alignment.py +99 -90
  54. biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
  55. biotite/sequence/align/buckets.py +12 -10
  56. biotite/sequence/align/cigar.py +43 -52
  57. biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
  58. biotite/sequence/align/kmeralphabet.pyx +55 -51
  59. biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
  60. biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
  61. biotite/sequence/align/kmertable.pyx +3 -2
  62. biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
  63. biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
  64. biotite/sequence/align/matrix.py +81 -82
  65. biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
  66. biotite/sequence/align/multiple.pyx +1 -1
  67. biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
  68. biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
  69. biotite/sequence/align/permutation.pyx +12 -4
  70. biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
  71. biotite/sequence/align/selector.pyx +52 -54
  72. biotite/sequence/align/statistics.py +32 -33
  73. biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
  74. biotite/sequence/alphabet.py +51 -65
  75. biotite/sequence/annotation.py +78 -77
  76. biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
  77. biotite/sequence/codon.py +90 -79
  78. biotite/sequence/graphics/__init__.py +1 -1
  79. biotite/sequence/graphics/alignment.py +184 -103
  80. biotite/sequence/graphics/colorschemes.py +10 -12
  81. biotite/sequence/graphics/dendrogram.py +79 -34
  82. biotite/sequence/graphics/features.py +133 -99
  83. biotite/sequence/graphics/logo.py +22 -28
  84. biotite/sequence/graphics/plasmid.py +229 -178
  85. biotite/sequence/io/fasta/__init__.py +1 -1
  86. biotite/sequence/io/fasta/convert.py +44 -33
  87. biotite/sequence/io/fasta/file.py +42 -55
  88. biotite/sequence/io/fastq/__init__.py +1 -1
  89. biotite/sequence/io/fastq/convert.py +11 -14
  90. biotite/sequence/io/fastq/file.py +68 -112
  91. biotite/sequence/io/genbank/__init__.py +2 -2
  92. biotite/sequence/io/genbank/annotation.py +12 -20
  93. biotite/sequence/io/genbank/file.py +74 -76
  94. biotite/sequence/io/genbank/metadata.py +74 -62
  95. biotite/sequence/io/genbank/sequence.py +13 -14
  96. biotite/sequence/io/general.py +39 -30
  97. biotite/sequence/io/gff/__init__.py +2 -2
  98. biotite/sequence/io/gff/convert.py +10 -15
  99. biotite/sequence/io/gff/file.py +81 -65
  100. biotite/sequence/phylo/__init__.py +1 -1
  101. biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
  102. biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
  103. biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
  104. biotite/sequence/profile.py +57 -28
  105. biotite/sequence/search.py +17 -15
  106. biotite/sequence/seqtypes.py +200 -164
  107. biotite/sequence/sequence.py +15 -17
  108. biotite/structure/__init__.py +3 -3
  109. biotite/structure/atoms.py +246 -236
  110. biotite/structure/basepairs.py +260 -271
  111. biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
  112. biotite/structure/bonds.pyx +29 -32
  113. biotite/structure/box.py +67 -71
  114. biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
  115. biotite/structure/chains.py +55 -39
  116. biotite/structure/charges.cp310-win_amd64.pyd +0 -0
  117. biotite/structure/compare.py +32 -32
  118. biotite/structure/density.py +13 -18
  119. biotite/structure/dotbracket.py +20 -22
  120. biotite/structure/error.py +10 -2
  121. biotite/structure/filter.py +83 -78
  122. biotite/structure/geometry.py +130 -119
  123. biotite/structure/graphics/atoms.py +60 -43
  124. biotite/structure/graphics/rna.py +81 -68
  125. biotite/structure/hbond.py +112 -93
  126. biotite/structure/info/__init__.py +0 -2
  127. biotite/structure/info/atoms.py +10 -11
  128. biotite/structure/info/bonds.py +41 -43
  129. biotite/structure/info/ccd.py +4 -5
  130. biotite/structure/info/groups.py +1 -3
  131. biotite/structure/info/masses.py +5 -10
  132. biotite/structure/info/misc.py +1 -1
  133. biotite/structure/info/radii.py +20 -20
  134. biotite/structure/info/standardize.py +15 -26
  135. biotite/structure/integrity.py +18 -71
  136. biotite/structure/io/__init__.py +3 -4
  137. biotite/structure/io/dcd/__init__.py +1 -1
  138. biotite/structure/io/dcd/file.py +22 -20
  139. biotite/structure/io/general.py +47 -61
  140. biotite/structure/io/gro/__init__.py +1 -1
  141. biotite/structure/io/gro/file.py +73 -72
  142. biotite/structure/io/mol/__init__.py +1 -1
  143. biotite/structure/io/mol/convert.py +8 -11
  144. biotite/structure/io/mol/ctab.py +37 -36
  145. biotite/structure/io/mol/header.py +14 -10
  146. biotite/structure/io/mol/mol.py +9 -53
  147. biotite/structure/io/mol/sdf.py +47 -50
  148. biotite/structure/io/netcdf/__init__.py +1 -1
  149. biotite/structure/io/netcdf/file.py +24 -23
  150. biotite/structure/io/pdb/__init__.py +1 -1
  151. biotite/structure/io/pdb/convert.py +32 -20
  152. biotite/structure/io/pdb/file.py +151 -172
  153. biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
  154. biotite/structure/io/pdbqt/__init__.py +1 -1
  155. biotite/structure/io/pdbqt/convert.py +17 -11
  156. biotite/structure/io/pdbqt/file.py +128 -80
  157. biotite/structure/io/pdbx/__init__.py +1 -2
  158. biotite/structure/io/pdbx/bcif.py +36 -44
  159. biotite/structure/io/pdbx/cif.py +140 -110
  160. biotite/structure/io/pdbx/component.py +10 -16
  161. biotite/structure/io/pdbx/convert.py +260 -258
  162. biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
  163. biotite/structure/io/trajfile.py +90 -107
  164. biotite/structure/io/trr/__init__.py +1 -1
  165. biotite/structure/io/trr/file.py +12 -15
  166. biotite/structure/io/xtc/__init__.py +1 -1
  167. biotite/structure/io/xtc/file.py +11 -14
  168. biotite/structure/mechanics.py +9 -11
  169. biotite/structure/molecules.py +3 -4
  170. biotite/structure/pseudoknots.py +53 -67
  171. biotite/structure/rdf.py +23 -21
  172. biotite/structure/repair.py +137 -86
  173. biotite/structure/residues.py +26 -16
  174. biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
  175. biotite/structure/{resutil.py → segments.py} +24 -23
  176. biotite/structure/sequence.py +10 -11
  177. biotite/structure/sse.py +100 -119
  178. biotite/structure/superimpose.py +39 -77
  179. biotite/structure/transform.py +97 -71
  180. biotite/structure/util.py +11 -13
  181. biotite/version.py +2 -2
  182. biotite/visualize.py +69 -55
  183. {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/METADATA +6 -5
  184. biotite-1.0.1.dist-info/RECORD +322 -0
  185. biotite/structure/io/ctab.py +0 -72
  186. biotite/structure/io/mmtf/__init__.py +0 -21
  187. biotite/structure/io/mmtf/assembly.py +0 -214
  188. biotite/structure/io/mmtf/convertarray.cp310-win_amd64.pyd +0 -0
  189. biotite/structure/io/mmtf/convertarray.pyx +0 -341
  190. biotite/structure/io/mmtf/convertfile.cp310-win_amd64.pyd +0 -0
  191. biotite/structure/io/mmtf/convertfile.pyx +0 -501
  192. biotite/structure/io/mmtf/decode.cp310-win_amd64.pyd +0 -0
  193. biotite/structure/io/mmtf/decode.pyx +0 -152
  194. biotite/structure/io/mmtf/encode.cp310-win_amd64.pyd +0 -0
  195. biotite/structure/io/mmtf/encode.pyx +0 -183
  196. biotite/structure/io/mmtf/file.py +0 -233
  197. biotite/structure/io/npz/__init__.py +0 -20
  198. biotite/structure/io/npz/file.py +0 -152
  199. biotite/structure/io/pdbx/legacy.py +0 -267
  200. biotite/structure/io/tng/__init__.py +0 -13
  201. biotite/structure/io/tng/file.py +0 -46
  202. biotite/temp.py +0 -86
  203. biotite-0.41.2.dist-info/RECORD +0 -340
  204. {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/WHEEL +0 -0
  205. {biotite-0.41.2.dist-info → biotite-1.0.1.dist-info}/licenses/LICENSE.rst +0 -0
@@ -8,13 +8,14 @@ __all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
8
8
 
9
9
  import enum
10
10
  import numpy as np
11
- from .alignment import Alignment, get_codes
11
+ from biotite.sequence.align.alignment import Alignment, get_codes
12
12
 
13
13
 
14
14
  class CigarOp(enum.IntEnum):
15
15
  """
16
16
  An enum for the different CIGAR operations.
17
17
  """
18
+
18
19
  MATCH = 0
19
20
  INSERTION = 1
20
21
  DELETION = 2
@@ -46,23 +47,23 @@ class CigarOp(enum.IntEnum):
46
47
  def to_cigar_symbol(self):
47
48
  return _op_to_str[self]
48
49
 
50
+
49
51
  _str_to_op = {
50
- "M" : CigarOp.MATCH,
51
- "I" : CigarOp.INSERTION,
52
- "D" : CigarOp.DELETION,
53
- "N" : CigarOp.INTRON,
54
- "S" : CigarOp.SOFT_CLIP,
55
- "H" : CigarOp.HARD_CLIP,
56
- "P" : CigarOp.PADDING,
57
- "=" : CigarOp.EQUAL,
58
- "X" : CigarOp.DIFFERENT,
59
- "B" : CigarOp.BACK
60
- }
52
+ "M": CigarOp.MATCH,
53
+ "I": CigarOp.INSERTION,
54
+ "D": CigarOp.DELETION,
55
+ "N": CigarOp.INTRON,
56
+ "S": CigarOp.SOFT_CLIP,
57
+ "H": CigarOp.HARD_CLIP,
58
+ "P": CigarOp.PADDING,
59
+ "=": CigarOp.EQUAL,
60
+ "X": CigarOp.DIFFERENT,
61
+ "B": CigarOp.BACK,
62
+ }
61
63
  _op_to_str = {v: k for k, v in _str_to_op.items()}
62
64
 
63
65
 
64
- def read_alignment_from_cigar(cigar, position,
65
- reference_sequence, segment_sequence):
66
+ def read_alignment_from_cigar(cigar, position, reference_sequence, segment_sequence):
66
67
  """
67
68
  Create an :class:`Alignment` from a CIGAR string.
68
69
 
@@ -147,20 +148,16 @@ def read_alignment_from_cigar(cigar, position,
147
148
  else:
148
149
  operations = np.asarray(cigar, dtype=int)
149
150
  if operations.ndim != 2:
150
- raise ValueError(
151
- "Expected array with shape (n,2)"
152
- )
151
+ raise ValueError("Expected array with shape (n,2)")
153
152
  if operations.shape[1] != 2:
154
- raise ValueError(
155
- "Expected (operation, length) pairs"
156
- )
153
+ raise ValueError("Expected (operation, length) pairs")
157
154
 
158
155
  if len(operations) == 0:
159
156
  return Alignment(
160
157
  [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
161
158
  )
162
159
 
163
- trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int)
160
+ trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int)
164
161
  clip_mask = np.ones(trace.shape[0], dtype=bool)
165
162
 
166
163
  i = 0
@@ -187,19 +184,23 @@ def read_alignment_from_cigar(cigar, position,
187
184
  elif op == CigarOp.HARD_CLIP:
188
185
  clip_mask[i : i + length] = False
189
186
  else:
190
- raise ValueError(
191
- f"CIGAR operation {op} is not implemented"
192
- )
187
+ raise ValueError(f"CIGAR operation {op} is not implemented")
193
188
  i += length
194
189
  # Remove clipped positions
195
190
  trace = trace[clip_mask]
196
191
  return Alignment([reference_sequence, segment_sequence], trace)
197
192
 
198
193
 
199
- def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
200
- introns=(), distinguish_matches=False,
201
- hard_clip=False, include_terminal_gaps=False,
202
- as_string=True):
194
+ def write_alignment_to_cigar(
195
+ alignment,
196
+ reference_index=0,
197
+ segment_index=1,
198
+ introns=(),
199
+ distinguish_matches=False,
200
+ hard_clip=False,
201
+ include_terminal_gaps=False,
202
+ as_string=True,
203
+ ):
203
204
  """
204
205
  Convert an :class:`Alignment` into a CIGAR string.
205
206
 
@@ -293,10 +294,10 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
293
294
 
294
295
  >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
295
296
  >>> for op, length in op_tuples:
296
- ... print(CigarOp(op), length)
297
- CigarOp.MATCH 9
298
- CigarOp.DELETION 2
299
- CigarOp.MATCH 12
297
+ ... print(CigarOp(op).name, length)
298
+ MATCH 9
299
+ DELETION 2
300
+ MATCH 12
300
301
  """
301
302
  if not include_terminal_gaps:
302
303
  alignment = _remove_terminal_segment_gaps(alignment, segment_index)
@@ -305,8 +306,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
305
306
  seg_trace = alignment.trace[:, segment_index]
306
307
  operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int)
307
308
 
308
- insertion_mask = (ref_trace == -1)
309
- deletion_mask = (seg_trace == -1)
309
+ insertion_mask = ref_trace == -1
310
+ deletion_mask = seg_trace == -1
310
311
  if np.any(insertion_mask & deletion_mask):
311
312
  raise ValueError(
312
313
  "Alignment contains insertion and deletion at the same position"
@@ -318,35 +319,27 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
318
319
  intron_mask = np.zeros(operations.shape[0], dtype=bool)
319
320
  for start, stop in introns:
320
321
  if start >= stop:
321
- raise ValueError(
322
- "Intron start must be smaller than intron stop"
323
- )
322
+ raise ValueError("Intron start must be smaller than intron stop")
324
323
  if start < 0:
325
- raise ValueError(
326
- "Intron start must not be negative"
327
- )
324
+ raise ValueError("Intron start must not be negative")
328
325
  intron_mask[(ref_trace >= start) & (ref_trace < stop)] = True
329
326
  if np.any(intron_mask & ~deletion_mask):
330
- raise ValueError(
331
- "Introns must be within gaps in the reference sequence"
332
- )
327
+ raise ValueError("Introns must be within gaps in the reference sequence")
333
328
  operations[intron_mask] = CigarOp.INTRON
334
329
 
335
330
  if distinguish_matches:
336
331
  symbol_codes = get_codes(alignment)
337
332
  ref_codes = symbol_codes[reference_index, :]
338
333
  seg_codes = symbol_codes[segment_index, :]
339
- equal_mask = (ref_codes == seg_codes)
340
- match_mask = (operations == CigarOp.MATCH)
334
+ equal_mask = ref_codes == seg_codes
335
+ match_mask = operations == CigarOp.MATCH
341
336
  operations[equal_mask & match_mask] = CigarOp.EQUAL
342
337
  operations[~equal_mask & match_mask] = CigarOp.DIFFERENT
343
338
 
344
339
  op_tuples = _aggregate_consecutive(operations)
345
340
 
346
341
  clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
347
- start_clip_length, end_clip_length = _find_clipped_bases(
348
- alignment, segment_index
349
- )
342
+ start_clip_length, end_clip_length = _find_clipped_bases(alignment, segment_index)
350
343
  if start_clip_length != 0:
351
344
  start_clip = [(clip_op, start_clip_length)]
352
345
  else:
@@ -386,9 +379,7 @@ def _find_clipped_bases(alignment, segment_index):
386
379
  # all previous bases are clipped...
387
380
  start_clip_length = seg_trace[0]
388
381
  # ...and the same applies for the last base
389
- end_clip_length = (
390
- len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
391
- )
382
+ end_clip_length = len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
392
383
  return start_clip_length, end_clip_length
393
384
 
394
385
 
@@ -431,4 +422,4 @@ def _op_tuples_from_cigar(cigar):
431
422
  op = CigarOp.from_cigar_symbol(char)
432
423
  op_tuples.append((op, count))
433
424
  count = ""
434
- return np.array(op_tuples, dtype=int)
425
+ return np.array(op_tuples, dtype=int)
@@ -33,7 +33,7 @@ class KmerAlphabet(Alphabet):
33
33
 
34
34
  This type of alphabet uses *k-mers* as symbols, i.e. all
35
35
  combinations of *k* symbols from its *base alphabet*.
36
-
36
+
37
37
  Its primary use is its :meth:`create_kmers()` method, that iterates
38
38
  over all overlapping *k-mers* in a :class:`Sequence` and encodes
39
39
  each one into its corresponding *k-mer* symbol code
@@ -68,7 +68,7 @@ class KmerAlphabet(Alphabet):
68
68
  integers, that indicate the *informative* positions.
69
69
  For a continuous *k-mer* the `spacing` would be
70
70
  ``[0, 1, 2,...]``.
71
-
71
+
72
72
  Attributes
73
73
  ----------
74
74
  base_alphabet : Alphabet
@@ -79,7 +79,7 @@ class KmerAlphabet(Alphabet):
79
79
  spacing : None or ndarray, dtype=int
80
80
  The *k-mer* model in array form, if spaced *k-mers* are used,
81
81
  ``None`` otherwise.
82
-
82
+
83
83
  Notes
84
84
  -----
85
85
  The symbol code for a *k-mer* :math:`s` calculates as
@@ -94,7 +94,7 @@ class KmerAlphabet(Alphabet):
94
94
 
95
95
  References
96
96
  ----------
97
-
97
+
98
98
  .. footbibliography::
99
99
 
100
100
  Examples
@@ -103,11 +103,11 @@ class KmerAlphabet(Alphabet):
103
103
 
104
104
  >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
105
105
  >>> print(base_alphabet.get_symbols())
106
- ['A', 'C', 'G', 'T']
106
+ ('A', 'C', 'G', 'T')
107
107
  >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
108
108
  >>> print(kmer_alphabet.get_symbols())
109
- ['AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
110
-
109
+ ('AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT')
110
+
111
111
  Encode and decode *k-mers*:
112
112
 
113
113
  >>> print(kmer_alphabet.encode("TC"))
@@ -127,7 +127,7 @@ class KmerAlphabet(Alphabet):
127
127
  [3 1]
128
128
 
129
129
  Encode all overlapping continuous k-mers of a sequence:
130
-
130
+
131
131
  >>> sequence = NucleotideSequence("ATTGCT")
132
132
  >>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
133
133
  >>> print(kmer_codes)
@@ -146,7 +146,7 @@ class KmerAlphabet(Alphabet):
146
146
  >>> print([s[0] + s[1] + "_" + s[2] for s in strings])
147
147
  ['BI_T', 'IQ_I', 'QT_T', 'TI_E']
148
148
  """
149
-
149
+
150
150
  def __init__(self, base_alphabet, k, spacing=None):
151
151
  if not isinstance(base_alphabet, Alphabet):
152
152
  raise TypeError(
@@ -157,7 +157,7 @@ class KmerAlphabet(Alphabet):
157
157
  raise ValueError("k must be at least 2")
158
158
  self._base_alph = base_alphabet
159
159
  self._k = k
160
-
160
+
161
161
  base_alph_len = len(self._base_alph)
162
162
  self._radix_multiplier = np.array(
163
163
  [base_alph_len**n for n in reversed(range(0, self._k))],
@@ -166,10 +166,10 @@ class KmerAlphabet(Alphabet):
166
166
 
167
167
  if spacing is None:
168
168
  self._spacing = None
169
-
169
+
170
170
  elif isinstance(spacing, str):
171
171
  self._spacing = _to_array_form(spacing)
172
-
172
+
173
173
  else:
174
174
  self._spacing = np.array(spacing, dtype=np.int64)
175
175
  self._spacing.sort()
@@ -181,13 +181,13 @@ class KmerAlphabet(Alphabet):
181
181
  raise ValueError(
182
182
  "Spacing model contains duplicate values"
183
183
  )
184
-
184
+
185
185
  if spacing is not None and len(self._spacing) != self._k:
186
186
  raise ValueError(
187
187
  f"Expected {self._k} informative positions, "
188
188
  f"but got {len(self._spacing)} positions in spacing"
189
189
  )
190
-
190
+
191
191
 
192
192
  @property
193
193
  def base_alphabet(self):
@@ -196,11 +196,11 @@ class KmerAlphabet(Alphabet):
196
196
  @property
197
197
  def k(self):
198
198
  return self._k
199
-
199
+
200
200
  @property
201
201
  def spacing(self):
202
202
  return None if self._spacing is None else self._spacing.copy()
203
-
203
+
204
204
 
205
205
  def get_symbols(self):
206
206
  """
@@ -210,10 +210,10 @@ class KmerAlphabet(Alphabet):
210
210
 
211
211
  Returns
212
212
  -------
213
- symbols : list
214
- A list of all *k-mer* symbols, i.e. all possible
213
+ symbols : tuple
214
+ A tuple of all *k-mer* symbols, i.e. all possible
215
215
  combinations of *k* symbols from its *base alphabet*.
216
-
216
+
217
217
  Notes
218
218
  -----
219
219
  In contrast to the base :class:`Alphabet` and
@@ -224,10 +224,10 @@ class KmerAlphabet(Alphabet):
224
224
  to be created first.
225
225
  """
226
226
  if isinstance(self._base_alph, LetterAlphabet):
227
- return ["".join(self.decode(code)) for code in range(len(self))]
227
+ return tuple(["".join(self.decode(code)) for code in range(len(self))])
228
228
  else:
229
- return [list(self.decode(code)) for code in range(len(self))]
230
-
229
+ return tuple([list(self.decode(code)) for code in range(len(self))])
230
+
231
231
 
232
232
  def extends(self, alphabet):
233
233
  # A KmerAlphabet cannot really extend another KmerAlphabet:
@@ -237,15 +237,15 @@ class KmerAlphabet(Alphabet):
237
237
  # A KmerAlphabet can only 'extend' another KmerAlphabet,
238
238
  # if the two alphabets are equal
239
239
  return alphabet == self
240
-
240
+
241
241
 
242
242
  def encode(self, symbol):
243
243
  return self.fuse(self._base_alph.encode_multiple(symbol))
244
-
244
+
245
245
 
246
246
  def decode(self, code):
247
247
  return self._base_alph.decode_multiple(self.split(code))
248
-
248
+
249
249
 
250
250
  def fuse(self, codes):
251
251
  """
@@ -261,7 +261,7 @@ class KmerAlphabet(Alphabet):
261
261
  ----------
262
262
  codes : ndarray, dtype=int, shape=(k,) or shape=(n,k)
263
263
  The symbol codes from the base alphabet to be fused.
264
-
264
+
265
265
  Returns
266
266
  -------
267
267
  kmer_codes : int or ndarray, dtype=np.int64, shape=(n,)
@@ -292,13 +292,13 @@ class KmerAlphabet(Alphabet):
292
292
  )
293
293
  if np.any(codes > len(self._base_alph)):
294
294
  raise AlphabetError("Given k-mer(s) contains invalid symbol code")
295
-
295
+
296
296
  orig_shape = codes.shape
297
297
  codes = np.atleast_2d(codes)
298
298
  kmer_code = np.sum(self._radix_multiplier * codes, axis=-1)
299
299
  # The last dimension is removed since it collapsed in np.sum
300
300
  return kmer_code.reshape(orig_shape[:-1])
301
-
301
+
302
302
  def split(self, kmer_code):
303
303
  """
304
304
  split(kmer_code)
@@ -313,7 +313,7 @@ class KmerAlphabet(Alphabet):
313
313
  ----------
314
314
  kmer_code : int or ndarray, dtype=int, shape=(n,)
315
315
  The *k-mer* code(s).
316
-
316
+
317
317
  Returns
318
318
  -------
319
319
  codes : ndarray, dtype=np.uint64, shape=(k,) or shape=(n,k)
@@ -341,13 +341,13 @@ class KmerAlphabet(Alphabet):
341
341
  raise AlphabetError(
342
342
  f"Given k-mer symbol code is invalid for this alphabet"
343
343
  )
344
-
344
+
345
345
  orig_shape = np.shape(kmer_code)
346
346
  split_codes = self._split(
347
347
  np.atleast_1d(kmer_code).astype(np.int64, copy=False)
348
348
  )
349
349
  return split_codes.reshape(orig_shape + (self._k,))
350
-
350
+
351
351
  @cython.boundscheck(False)
352
352
  @cython.wraparound(False)
353
353
  @cython.cdivision(True)
@@ -360,7 +360,7 @@ class KmerAlphabet(Alphabet):
360
360
  cdef uint64[:,:] split_codes = np.empty(
361
361
  (codes.shape[0], self._k), dtype=np.uint64
362
362
  )
363
-
363
+
364
364
  cdef int k = self._k
365
365
  for i in range(codes.shape[0]):
366
366
  code = codes[i]
@@ -369,9 +369,9 @@ class KmerAlphabet(Alphabet):
369
369
  symbol_code = code // val
370
370
  split_codes[i,n] = symbol_code
371
371
  code -= symbol_code * val
372
-
372
+
373
373
  return np.asarray(split_codes)
374
-
374
+
375
375
 
376
376
  def kmer_array_length(self, int64 length):
377
377
  """
@@ -385,7 +385,7 @@ class KmerAlphabet(Alphabet):
385
385
  ----------
386
386
  length : int
387
387
  The length of the hypothetical sequence
388
-
388
+
389
389
  Returns
390
390
  -------
391
391
  kmer_length : int
@@ -400,7 +400,7 @@ class KmerAlphabet(Alphabet):
400
400
  spacing = self._spacing
401
401
  max_offset = self._spacing[len(spacing)-1] + 1
402
402
  return length - max_offset + 1
403
-
403
+
404
404
 
405
405
  def create_kmers(self, seq_code):
406
406
  """
@@ -418,7 +418,7 @@ class KmerAlphabet(Alphabet):
418
418
  -------
419
419
  kmer_codes : ndarray, dtype=int64
420
420
  The symbol codes for the *k-mers*.
421
-
421
+
422
422
  Examples
423
423
  --------
424
424
 
@@ -435,7 +435,7 @@ class KmerAlphabet(Alphabet):
435
435
  return self._create_continuous_kmers(seq_code)
436
436
  else:
437
437
  return self._create_spaced_kmers(seq_code)
438
-
438
+
439
439
  @cython.boundscheck(False)
440
440
  @cython.wraparound(False)
441
441
  def _create_continuous_kmers(self, CodeType[:] seq_code not None):
@@ -460,7 +460,7 @@ class KmerAlphabet(Alphabet):
460
460
  cdef int64[:] kmers = np.empty(
461
461
  self.kmer_array_length(len(seq_code)), dtype=np.int64
462
462
  )
463
-
463
+
464
464
  cdef CodeType code
465
465
  cdef int64 kmer, prev_kmer
466
466
  # Compute first k-mer using naive approach
@@ -471,7 +471,7 @@ class KmerAlphabet(Alphabet):
471
471
  raise AlphabetError(f"Symbol code {code} is out of range")
472
472
  kmer += radix_multiplier[i] * code
473
473
  kmers[0] = kmer
474
-
474
+
475
475
  # Compute all following k-mers from the previous one
476
476
  prev_kmer = kmer
477
477
  for i in range(1, kmers.shape[0]):
@@ -481,7 +481,7 @@ class KmerAlphabet(Alphabet):
481
481
  kmer = (
482
482
  (
483
483
  # Remove first symbol
484
- (prev_kmer - seq_code[i - 1] * end_radix_multiplier)
484
+ (prev_kmer - seq_code[i - 1] * end_radix_multiplier)
485
485
  # Shift k-mer to left
486
486
  * alphabet_length
487
487
  )
@@ -490,9 +490,9 @@ class KmerAlphabet(Alphabet):
490
490
  )
491
491
  kmers[i] = kmer
492
492
  prev_kmer = kmer
493
-
493
+
494
494
  return np.asarray(kmers)
495
-
495
+
496
496
  @cython.boundscheck(False)
497
497
  @cython.wraparound(False)
498
498
  def _create_spaced_kmers(self, CodeType[:] seq_code not None):
@@ -515,7 +515,7 @@ class KmerAlphabet(Alphabet):
515
515
  cdef int64[:] kmers = np.empty(
516
516
  self.kmer_array_length(len(seq_code)), dtype=np.int64
517
517
  )
518
-
518
+
519
519
  cdef CodeType code
520
520
  cdef int64 kmer
521
521
  cdef int64 offset
@@ -528,18 +528,18 @@ class KmerAlphabet(Alphabet):
528
528
  raise AlphabetError(f"Symbol code {code} is out of range")
529
529
  kmer += radix_multiplier[j] * code
530
530
  kmers[i] = kmer
531
-
531
+
532
532
  return np.asarray(kmers)
533
-
533
+
534
534
 
535
535
  def __str__(self):
536
536
  return str(self.get_symbols())
537
-
537
+
538
538
 
539
539
  def __repr__(self):
540
540
  return f"KmerAlphabet({repr(self._base_alph)}, " \
541
541
  f"{self._k}, {repr(self._spacing)})"
542
-
542
+
543
543
 
544
544
  def __eq__(self, item):
545
545
  if item is self:
@@ -550,15 +550,19 @@ class KmerAlphabet(Alphabet):
550
550
  return False
551
551
  if self._k != item._k:
552
552
  return False
553
-
553
+
554
554
  if self._spacing is None:
555
555
  if item._spacing is not None:
556
556
  return False
557
557
  elif np.any(self._spacing != item._spacing):
558
558
  return False
559
-
559
+
560
560
  return True
561
-
561
+
562
+
563
+ def __hash__(self):
564
+ return hash((self._base_alph, self._k, tuple(self._spacing.tolist())))
565
+
562
566
 
563
567
  def __len__(self):
564
568
  return int(len(self._base_alph) ** self._k)
@@ -1352,7 +1352,8 @@ cdef class KmerTable:
1352
1352
 
1353
1353
 
1354
1354
  def __iter__(self):
1355
- return iter(self.get_kmers())
1355
+ for kmer in self.get_kmers():
1356
+ yield kmer.item()
1356
1357
 
1357
1358
 
1358
1359
  def __reversed__(self):
@@ -3394,7 +3395,7 @@ def _to_string(table):
3394
3395
  else:
3395
3396
  symbols = str(tuple(symbols))
3396
3397
  line = symbols + ": " + ", ".join(
3397
- [str(tuple(pos)) for pos in table[kmer]]
3398
+ [str((ref_id.item(), pos.item())) for ref_id, pos in table[kmer]]
3398
3399
  )
3399
3400
  lines.append(line)
3400
3401
  return "\n".join(lines)