biotite 0.38.0__cp311-cp311-macosx_11_0_arm64.whl → 0.40.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (124) hide show
  1. biotite/__init__.py +3 -3
  2. biotite/application/application.py +33 -28
  3. biotite/application/dssp/app.py +18 -18
  4. biotite/application/sra/__init__.py +5 -0
  5. biotite/application/sra/app.py +337 -55
  6. biotite/database/entrez/__init__.py +2 -1
  7. biotite/database/entrez/check.py +14 -3
  8. biotite/database/entrez/download.py +20 -13
  9. biotite/database/entrez/key.py +44 -0
  10. biotite/database/entrez/query.py +38 -34
  11. biotite/database/pubchem/query.py +44 -44
  12. biotite/database/rcsb/download.py +19 -14
  13. biotite/database/rcsb/query.py +46 -46
  14. biotite/sequence/align/__init__.py +5 -1
  15. biotite/sequence/align/banded.c +1408 -1025
  16. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  17. biotite/sequence/align/buckets.py +69 -0
  18. biotite/sequence/align/cigar.py +389 -0
  19. biotite/sequence/align/kmeralphabet.c +3220 -2850
  20. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/kmersimilarity.c +713 -663
  22. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  23. biotite/sequence/align/kmertable.cpp +68398 -0
  24. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  25. biotite/sequence/align/localgapped.c +1507 -1074
  26. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  27. biotite/sequence/align/localungapped.c +1143 -833
  28. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  29. biotite/sequence/align/multiple.c +1569 -1092
  30. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  31. biotite/sequence/align/pairwise.c +1612 -1212
  32. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  33. biotite/sequence/align/permutation.c +33259 -0
  34. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  35. biotite/sequence/align/primes.txt +821 -0
  36. biotite/sequence/align/{kmertable.c → selector.c} +9129 -16497
  37. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  38. biotite/sequence/align/tracetable.c +685 -646
  39. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  40. biotite/sequence/codec.c +1159 -841
  41. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  42. biotite/sequence/graphics/alignment.py +212 -2
  43. biotite/sequence/io/genbank/annotation.py +11 -11
  44. biotite/sequence/phylo/nj.c +684 -636
  45. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  46. biotite/sequence/phylo/tree.c +970 -673
  47. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  48. biotite/sequence/phylo/upgma.c +672 -626
  49. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  50. biotite/structure/__init__.py +1 -1
  51. biotite/structure/atoms.py +1 -1
  52. biotite/structure/basepairs.py +7 -12
  53. biotite/structure/bonds.c +3861 -3749
  54. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  55. biotite/structure/celllist.c +727 -707
  56. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  57. biotite/structure/charges.c +1561 -1560
  58. biotite/structure/charges.cpython-311-darwin.so +0 -0
  59. biotite/structure/filter.py +30 -37
  60. biotite/structure/info/__init__.py +5 -8
  61. biotite/structure/info/atoms.py +25 -67
  62. biotite/structure/info/bonds.py +46 -100
  63. biotite/structure/info/ccd/README.rst +8 -0
  64. biotite/structure/info/ccd/amino_acids.txt +1646 -0
  65. biotite/structure/info/ccd/carbohydrates.txt +1133 -0
  66. biotite/structure/info/ccd/components.bcif +0 -0
  67. biotite/structure/info/ccd/nucleotides.txt +797 -0
  68. biotite/structure/info/ccd.py +95 -0
  69. biotite/structure/info/groups.py +90 -0
  70. biotite/structure/info/masses.py +21 -20
  71. biotite/structure/info/misc.py +11 -22
  72. biotite/structure/info/standardize.py +17 -12
  73. biotite/structure/io/__init__.py +2 -4
  74. biotite/structure/io/ctab.py +1 -1
  75. biotite/structure/io/general.py +37 -43
  76. biotite/structure/io/mmtf/__init__.py +3 -0
  77. biotite/structure/io/mmtf/convertarray.c +528 -365
  78. biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
  79. biotite/structure/io/mmtf/convertfile.c +725 -676
  80. biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
  81. biotite/structure/io/mmtf/decode.c +1070 -754
  82. biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
  83. biotite/structure/io/mmtf/encode.c +727 -677
  84. biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
  85. biotite/structure/io/mmtf/file.py +34 -26
  86. biotite/structure/io/npz/__init__.py +3 -0
  87. biotite/structure/io/npz/file.py +21 -18
  88. biotite/structure/io/pdb/__init__.py +3 -3
  89. biotite/structure/io/pdb/file.py +72 -70
  90. biotite/structure/io/pdb/hybrid36.c +540 -478
  91. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  92. biotite/structure/io/pdbqt/file.py +82 -68
  93. biotite/structure/io/pdbx/__init__.py +13 -6
  94. biotite/structure/io/pdbx/bcif.py +649 -0
  95. biotite/structure/io/pdbx/cif.py +1028 -0
  96. biotite/structure/io/pdbx/component.py +243 -0
  97. biotite/structure/io/pdbx/convert.py +707 -359
  98. biotite/structure/io/pdbx/encoding.c +112813 -0
  99. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  100. biotite/structure/io/pdbx/error.py +14 -0
  101. biotite/structure/io/pdbx/legacy.py +267 -0
  102. biotite/structure/molecules.py +151 -151
  103. biotite/structure/residues.py +40 -40
  104. biotite/structure/sasa.c +713 -644
  105. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  106. biotite/structure/superimpose.py +158 -115
  107. biotite/visualize.py +9 -11
  108. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
  109. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/RECORD +112 -102
  110. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
  111. biotite/structure/info/amino_acids.json +0 -1556
  112. biotite/structure/info/amino_acids.py +0 -42
  113. biotite/structure/info/carbohydrates.json +0 -1122
  114. biotite/structure/info/carbohydrates.py +0 -39
  115. biotite/structure/info/intra_bonds.msgpack +0 -0
  116. biotite/structure/info/link_types.msgpack +0 -1
  117. biotite/structure/info/nucleotides.json +0 -772
  118. biotite/structure/info/nucleotides.py +0 -39
  119. biotite/structure/info/residue_masses.msgpack +0 -0
  120. biotite/structure/info/residue_names.msgpack +0 -3
  121. biotite/structure/info/residues.msgpack +0 -0
  122. biotite/structure/io/pdbx/file.py +0 -652
  123. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
  124. {biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,69 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["bucket_number"]
8
+
9
+ from os.path import realpath, dirname, join
10
+ import numpy as np
11
+
12
+
13
# Lazily loaded, sorted array of the precomputed primes from 'primes.txt'
_primes = None


def bucket_number(n_kmers, load_factor=0.8):
    """
    Find an appropriate number of buckets for a :class:`BucketKmerTable`
    based on the number of elements (i.e. *k-mers*) that should be
    stored in the table.

    Parameters
    ----------
    n_kmers : int
        The expected number of *k-mers* that will be stored in the
        :class:`BucketKmerTable`.
        If this number deviates from the actual number of *k-mers* that
        will be stored, the load factor of the table will deviate
        by the same percentage.
    load_factor : float, optional
        The ratio of *k-mer* number to bucket number.
        Must be positive.
        The actual load factor will usually be lower, as the closest
        greater prime is returned (see *Notes*).

    Returns
    -------
    n_buckets : int
        The recommended number of buckets to use for a
        :class:`BucketKmerTable`, that stores `n_kmers` at the given
        `load_factor`.

    Raises
    ------
    ValueError
        If `load_factor` is not positive or if the required number of
        buckets exceeds the largest precomputed prime.

    Notes
    -----
    The function returns the closest greater prime number from a
    precomputed list of primes to use as the number of buckets.
    The reason is that prime numbers have proven to be good hash table
    sizes, if the hash function is not randomized.

    Let's take unambiguous nucleotide *k-mers* as example.
    If powers of two would be used as table size (another common scheme),
    taking the modulo operation on the *k-mer* code would simply erase
    the upper bits corresponding to the first nucleotide(s) in a
    *k-mer*.
    Hence, all *k-mers* with the same suffix would be stored in the same
    bin.
    """
    global _primes

    # A zero load factor would divide by zero and a negative one would
    # silently select the smallest prime
    if load_factor <= 0:
        raise ValueError("Load factor must be positive")

    if _primes is None:
        # The prime list is read only once and cached on module level
        primes_path = join(dirname(realpath(__file__)), "primes.txt")
        with open(primes_path) as file:
            _primes = np.array([
                int(line) for line in file.read().splitlines()
                # Skip empty lines and comment lines
                if line and not line.startswith("#")
            ])

    number = int(n_kmers / load_factor)
    # 'left' returns the prime itself, if 'number' is already in the list
    index = np.searchsorted(_primes, number, side="left")
    if index == len(_primes):
        raise ValueError("Number of buckets too large")
    return _primes[index]
@@ -0,0 +1,389 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.sequence.align"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
8
+
9
+ import enum
10
+ import numpy as np
11
+ from .alignment import Alignment, get_codes
12
+
13
+
14
class CigarOp(enum.IntEnum):
    """
    Enumeration of the supported CIGAR operations.

    Each member can be converted from and to its one-character CIGAR
    symbol via :meth:`from_cigar_symbol()` and
    :meth:`to_cigar_symbol()`.
    """
    MATCH = 0
    INSERTION = 1
    DELETION = 2
    INTRON = 3
    SOFT_CLIP = 4
    HARD_CLIP = 5
    PADDING = 6
    EQUAL = 7
    DIFFERENT = 8
    BACK = 9

    @staticmethod
    def from_cigar_symbol(symbol):
        """
        Look up the operation that belongs to a CIGAR symbol.

        Parameters
        ----------
        symbol : str
            The one-character CIGAR symbol.

        Returns
        -------
        op : CigarOp
            The corresponding enum member.
        """
        return _str_to_op[symbol]

    def to_cigar_symbol(self):
        """
        Get the CIGAR symbol of this operation.

        Returns
        -------
        symbol : str
            The one-character CIGAR symbol.
        """
        return _op_to_str[self]


# The symbols are listed in the order of the enum values above,
# so zipping them with the enum members pairs each symbol with its
# operation
_str_to_op = dict(zip("MIDNSHP=XB", CigarOp))
_op_to_str = {op: symbol for symbol, op in _str_to_op.items()}
62
+
63
+
64
def read_alignment_from_cigar(cigar, position,
                              reference_sequence, segment_sequence):
    """
    Create an :class:`Alignment` from a CIGAR string.

    Parameters
    ----------
    cigar : str or array-like, shape=(n,2), dtype=int
        The CIGAR string.
        Alternatively, the operations can be given as
        ``(operation, length)`` pairs, where the operation is a
        :class:`CigarOp` or its integer value.
    position : int
        0-based position of the first aligned base in the reference.
        0-based equivalent to the ``POS`` field in the SAM/BAM file.
    reference_sequence : Sequence
        The reference sequence.
    segment_sequence : Sequence
        The segment, read or query sequence.

    Returns
    -------
    alignment : Alignment
        The alignment.
        If `cigar` contains no operations, the alignment has an empty
        trace.

    Raises
    ------
    ValueError
        If `cigar` is given as pairs, but does not have the shape
        *(n,2)*, or if it contains an operation that is not
        implemented.

    See Also
    --------
    write_alignment_to_cigar

    Notes
    -----
    This function expects that the `segment_sequence` was taken from the
    SAM/BAM file, hence hard-clipped bases are not part of the sequence.
    Therefore, hard clipped bases are simply ignored in the CIGAR
    string.

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("9M2D12M", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    >>> print(read_alignment_from_cigar("4X5=2D7=1X4=", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG

    If bases in the segment sequence are soft-clipped, they do not
    appear in the alignment.
    Furthermore, the start of the reference sequence must be adapted.

    >>> print(read_alignment_from_cigar("4S5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Hard-clipped bases are not part of the segment sequence.
    Hence `H` operations are completely ignored.

    >>> seg = NucleotideSequence("GGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("4H5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Reading from BAM codes is also possible:

    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> op_tuples = [
    ...     (CigarOp.MATCH, 9),
    ...     (CigarOp.DELETION, 2),
    ...     (CigarOp.MATCH, 12)
    ... ]
    >>> print(read_alignment_from_cigar(op_tuples, 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    """
    if isinstance(cigar, str):
        operations = _op_tuples_from_cigar(cigar)
    else:
        operations = np.asarray(cigar, dtype=int)

    # Treat an empty CIGAR string and an empty list of pairs alike:
    # an empty array cannot be checked for the (n,2) shape, as e.g.
    # an empty list is converted into an array with shape (0,)
    if operations.size == 0:
        return Alignment(
            [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
        )
    if operations.ndim != 2:
        raise ValueError(
            "Expected array with shape (n,2)"
        )
    if operations.shape[1] != 2:
        raise ValueError(
            "Expected (operation, length) pairs"
        )

    # One trace row per operation repetition;
    # rows belonging to clipped bases are removed at the end
    trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int)
    clip_mask = np.ones(trace.shape[0], dtype=bool)

    i = 0
    ref_pos = position
    seg_pos = 0
    for op, length in operations:
        op = CigarOp(op)
        stop = i + length
        if op in (CigarOp.MATCH, CigarOp.EQUAL, CigarOp.DIFFERENT):
            trace[i:stop, 0] = np.arange(ref_pos, ref_pos + length)
            trace[i:stop, 1] = np.arange(seg_pos, seg_pos + length)
            ref_pos += length
            seg_pos += length
        elif op == CigarOp.INSERTION:
            # Bases only present in the segment -> gap in the reference
            trace[i:stop, 0] = -1
            trace[i:stop, 1] = np.arange(seg_pos, seg_pos + length)
            seg_pos += length
        elif op in (CigarOp.DELETION, CigarOp.INTRON):
            # Bases only present in the reference -> gap in the segment
            trace[i:stop, 0] = np.arange(ref_pos, ref_pos + length)
            trace[i:stop, 1] = -1
            ref_pos += length
        elif op == CigarOp.SOFT_CLIP:
            # Soft-clipped bases are part of the segment sequence,
            # so they must be skipped in the segment
            clip_mask[i:stop] = False
            seg_pos += length
        elif op == CigarOp.HARD_CLIP:
            # Hard-clipped bases are not part of the segment sequence,
            # so the segment position stays untouched
            clip_mask[i:stop] = False
        else:
            raise ValueError(
                f"CIGAR operation {op} is not implemented"
            )
        i = stop
    # Remove clipped positions
    trace = trace[clip_mask]
    return Alignment([reference_sequence, segment_sequence], trace)
189
+
190
+
191
def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
                             introns=(), distinguish_matches=False,
                             hard_clip=False, as_string=True):
    """
    Convert an :class:`Alignment` into a CIGAR string.

    Parameters
    ----------
    alignment : Alignment
        The alignment to be converted.
    reference_index : int, optional
        The index of the reference sequence in the alignment.
        By default the first sequence is used.
    segment_index : int, optional
        The index of the segment, read or query sequence in the
        alignment.
        By default the second sequence is used.
    introns : iterable object of tuple(int, int), optional
        The introns in the reference sequence.
        The introns are given as tuples of start and exclusive stop
        index.
        Alignment columns, whose reference position lies within an
        intron, are written as `'N'` in the CIGAR string.
        These columns must be gaps in the segment sequence.
        By default no introns are assumed.
    distinguish_matches : bool, optional
        If true, matches (`'='`) are distinguished from mismatches
        (`'X'`).
        Otherwise, matches and mismatches are reflected equally by an
        `'M'` in the CIGAR string.
    hard_clip : bool, optional
        If true, clipped bases are hard-clipped.
        Otherwise, clipped bases are soft-clipped.
    as_string : bool, optional
        If true, the CIGAR string is returned.
        Otherwise, an array of ``(operation, length)`` pairs is
        returned.

    Returns
    -------
    cigar : str or ndarray, shape=(n,2) dtype=int
        If `as_string` is true, the CIGAR string is returned.
        Otherwise, an array is returned, where the first column
        specifies the :class:`CigarOp` and the second column specifies
        the number of repetitions of that operation.

    See Also
    --------
    read_alignment_from_cigar

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> semiglobal_alignment = align_optimal(
    ...     ref, seg, matrix, local=False, terminal_penalty=False
    ... )[0]
    >>> print(semiglobal_alignment)
    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
    ---CCCCGGTTT--GACCGTATGTAG----
    >>> print(write_alignment_to_cigar(semiglobal_alignment))
    9M2D12M
    >>> print(write_alignment_to_cigar(semiglobal_alignment, introns=[(12, 14)]))
    9M2N12M
    >>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
    4X5=2D7=1X4=
    >>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
    >>> print(local_alignment)
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG
    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=False))
    4S5M2D12M
    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=True))
    4H5M2D12M

    Writing operations as BAM codes is also possible:

    >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
    >>> for op, length in op_tuples:
    ...     print(CigarOp(op), length)
    CigarOp.MATCH 9
    CigarOp.DELETION 2
    CigarOp.MATCH 12
    """
    # Restrict the alignment to the columns spanned by the segment,
    # i.e. drop the terminal gaps of the segment sequence
    aligned_rows = np.where(alignment.trace[:, segment_index] != -1)[0]
    first_row, last_row = aligned_rows[0], aligned_rows[-1]
    alignment = alignment[first_row : last_row + 1]

    ref_trace = alignment.trace[:, reference_index]
    seg_trace = alignment.trace[:, segment_index]

    # Every column starts out as match and is refined step by step
    operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int)
    is_insertion = (ref_trace == -1)
    is_deletion = (seg_trace == -1)
    if (is_insertion & is_deletion).any():
        raise ValueError(
            "Alignment contains insertion and deletion at the same position"
        )
    operations[is_insertion] = CigarOp.INSERTION
    operations[is_deletion] = CigarOp.DELETION

    if introns is not None:
        in_intron = np.zeros(operations.shape[0], dtype=bool)
        for start, stop in introns:
            if start >= stop:
                raise ValueError(
                    "Intron start must be smaller than intron stop"
                )
            if start < 0:
                raise ValueError(
                    "Intron start must not be negative"
                )
            in_intron |= (ref_trace >= start) & (ref_trace < stop)
        # An intron column is only valid, if the segment has a gap there
        if (in_intron & ~is_deletion).any():
            raise ValueError(
                "Introns must be within gaps in the reference sequence"
            )
        operations[in_intron] = CigarOp.INTRON

    if distinguish_matches:
        codes = get_codes(alignment)
        same_symbol = (codes[reference_index, :] == codes[segment_index, :])
        # Determine the match columns before they get overwritten
        is_match = (operations == CigarOp.MATCH)
        operations[is_match & same_symbol] = CigarOp.EQUAL
        operations[is_match & ~same_symbol] = CigarOp.DIFFERENT

    op_tuples = _aggregate_consecutive(operations)

    # Segment bases before the first and after the last aligned base
    # are interpreted as clipped
    clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
    no_clip = np.zeros((0, 2), dtype=int)
    # The first trace element is the index of the first aligned base,
    # which equals the number of bases in front of it
    leading_length = seg_trace[0]
    trailing_length = (
        len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
    )
    start_clip = [(clip_op, leading_length)] if leading_length != 0 else no_clip
    end_clip = [(clip_op, trailing_length)] if trailing_length != 0 else no_clip
    op_tuples = np.concatenate((start_clip, op_tuples, end_clip))

    if as_string:
        return _cigar_from_op_tuples(op_tuples)
    return op_tuples
348
+
349
+
350
+ def _aggregate_consecutive(operations):
351
+ """
352
+ Aggregate consecutive operations of the same type.
353
+ """
354
+ op_start_indices = np.where(operations[:-1] != operations[1:])[0]
355
+ # Also include the first operation
356
+ op_start_indices += 1
357
+ op_start_indices = np.concatenate(([0], op_start_indices))
358
+ ops = operations[op_start_indices]
359
+ length = np.diff(np.append(op_start_indices, len(operations)))
360
+ return np.stack((ops, length), axis=-1)
361
+
362
+
363
def _cigar_from_op_tuples(op_tuples):
    """
    Assemble a CIGAR string from BAM integer tuples.

    Each tuple consists of the operation as first element and the
    number of its repetitions as second element.
    """
    return "".join(
        f"{count}{CigarOp(op).to_cigar_symbol()}" for op, count in op_tuples
    )
374
+
375
+
376
def _op_tuples_from_cigar(cigar):
    """
    Create an array of ``(operation, length)`` pairs from a CIGAR
    string.

    Parameters
    ----------
    cigar : str
        The CIGAR string.

    Returns
    -------
    op_tuples : ndarray, shape=(n,2), dtype=int
        The operation (first column) and its number of repetitions
        (second column).
        The array has the shape *(0,2)*, if `cigar` is empty.

    Raises
    ------
    KeyError
        If `cigar` contains an unknown operation symbol.
    ValueError
        If an operation is not preceded by a length or if `cigar` ends
        with a length that is not followed by an operation.
    """
    op_tuples = []
    digits = []
    for char in cigar:
        if char.isdigit():
            digits.append(char)
            continue
        op = CigarOp.from_cigar_symbol(char)
        if not digits:
            raise ValueError(
                f"CIGAR operation '{char}' is not preceded by a length"
            )
        op_tuples.append((op, int("".join(digits))))
        digits = []
    # Digits left over at this point would otherwise be dropped silently
    if digits:
        raise ValueError(
            "CIGAR string ends with a length without an operation"
        )
    # The reshape keeps the (n,2) shape for an empty CIGAR string as well
    return np.array(op_tuples, dtype=int).reshape(-1, 2)