edlib 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1482 @@
1
+ #include "edlib.h"
2
+
3
+ #include <stdint.h>
4
+ #include <cstdlib>
5
+ #include <algorithm>
6
+ #include <vector>
7
+ #include <cstring>
8
+ #include <string>
9
+
10
+ using namespace std;
11
+
12
// Machine word used as one bit-vector block of the Myers algorithm.
typedef uint64_t Word;
static const int WORD_SIZE = sizeof(Word) * 8; // Size of Word in bits
static const Word WORD_1 = static_cast<Word>(1); // Word with only the lowest bit set: 00..01
static const Word HIGH_BIT_MASK = WORD_1 << (WORD_SIZE - 1);  // 100..00
static const int MAX_UCHAR = 255; // Largest value of unsigned char; used to size per-symbol tables.
17
+
18
+ // Data needed to find alignment.
19
+ struct AlignmentData {
20
+ Word* Ps;
21
+ Word* Ms;
22
+ int* scores;
23
+ int* firstBlocks;
24
+ int* lastBlocks;
25
+
26
+ AlignmentData(int maxNumBlocks, int targetLength) {
27
+ // We build a complete table and mark first and last block for each column
28
+ // (because algorithm is banded so only part of each columns is used).
29
+ // TODO: do not build a whole table, but just enough blocks for each column.
30
+ Ps = new Word[maxNumBlocks * targetLength];
31
+ Ms = new Word[maxNumBlocks * targetLength];
32
+ scores = new int[maxNumBlocks * targetLength];
33
+ firstBlocks = new int[targetLength];
34
+ lastBlocks = new int[targetLength];
35
+ }
36
+
37
+ ~AlignmentData() {
38
+ delete[] Ps;
39
+ delete[] Ms;
40
+ delete[] scores;
41
+ delete[] firstBlocks;
42
+ delete[] lastBlocks;
43
+ }
44
+ };
45
+
46
+ struct Block {
47
+ Word P; // Pvin
48
+ Word M; // Mvin
49
+ int score; // score of last cell in block;
50
+
51
+ Block() {}
52
+ Block(Word p, Word m, int s) :P(p), M(m), score(s) {}
53
+ };
54
+
55
+
56
+ /**
57
+ * Defines equality relation on alphabet characters.
58
+ * By default each character is always equal only to itself, but you can also provide additional equalities.
59
+ */
60
+ class EqualityDefinition {
61
+ private:
62
+ bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1];
63
+ public:
64
+ EqualityDefinition(const string& alphabet,
65
+ const EdlibEqualityPair* additionalEqualities = NULL,
66
+ const int additionalEqualitiesLength = 0) {
67
+ for (int i = 0; i < static_cast<int>(alphabet.size()); i++) {
68
+ for (int j = 0; j < static_cast<int>(alphabet.size()); j++) {
69
+ matrix[i][j] = (i == j);
70
+ }
71
+ }
72
+ if (additionalEqualities != NULL) {
73
+ for (int i = 0; i < additionalEqualitiesLength; i++) {
74
+ size_t firstTransformed = alphabet.find(additionalEqualities[i].first);
75
+ size_t secondTransformed = alphabet.find(additionalEqualities[i].second);
76
+ if (firstTransformed != string::npos && secondTransformed != string::npos) {
77
+ matrix[firstTransformed][secondTransformed] = matrix[secondTransformed][firstTransformed] = true;
78
+ }
79
+ }
80
+ }
81
+ }
82
+
83
+ /**
84
+ * @param a Element from transformed sequence.
85
+ * @param b Element from transformed sequence.
86
+ * @return True if a and b are defined as equal, false otherwise.
87
+ */
88
+ bool areEqual(unsigned char a, unsigned char b) const {
89
+ return matrix[a][b];
90
+ }
91
+ };
92
+
93
+ static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
94
+ int queryLength,
95
+ const unsigned char* target, int targetLength,
96
+ int k, EdlibAlignMode mode,
97
+ int* bestScore_, int** positions_, int* numPositions_);
98
+
99
+ static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks,
100
+ int queryLength,
101
+ const unsigned char* target, int targetLength,
102
+ int k, int* bestScore_,
103
+ int* position_, bool findAlignment,
104
+ AlignmentData** alignData, int targetStopPosition);
105
+
106
+
107
+ static int obtainAlignment(
108
+ const unsigned char* query, const unsigned char* rQuery, int queryLength,
109
+ const unsigned char* target, const unsigned char* rTarget, int targetLength,
110
+ const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
111
+ unsigned char** alignment, int* alignmentLength);
112
+
113
+ static int obtainAlignmentHirschberg(
114
+ const unsigned char* query, const unsigned char* rQuery, int queryLength,
115
+ const unsigned char* target, const unsigned char* rTarget, int targetLength,
116
+ const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
117
+ unsigned char** alignment, int* alignmentLength);
118
+
119
+ static int obtainAlignmentTraceback(int queryLength, int targetLength,
120
+ int bestScore, const AlignmentData* alignData,
121
+ unsigned char** alignment, int* alignmentLength);
122
+
123
+ static string transformSequences(const char* queryOriginal, int queryLength,
124
+ const char* targetOriginal, int targetLength,
125
+ unsigned char** queryTransformed,
126
+ unsigned char** targetTransformed);
127
+
128
+ static inline int ceilDiv(int x, int y);
129
+
130
+ static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
131
+
132
+ static inline Word* buildPeq(const int alphabetLength,
133
+ const unsigned char* query,
134
+ const int queryLength,
135
+ const EqualityDefinition& equalityDefinition);
136
+
137
+
138
/**
 * Main edlib method.
 *
 * Steps, in order:
 *   1. Transform both sequences and recognize the alphabet.
 *   2. Handle the special case where either sequence is empty.
 *   3. Compute the edit distance and end locations (semi-global search for HW/SHW,
 *      global search otherwise), doubling k until a solution is found when config.k < 0.
 *   4. If requested by config.task, compute start locations and the alignment path
 *      for the first (start, end) location pair.
 *
 * @param queryOriginal  Query sequence.
 * @param queryLength    Number of characters in queryOriginal.
 * @param targetOriginal Target sequence.
 * @param targetLength   Number of characters in targetOriginal.
 * @param config         Mode, task, k and additional equalities.
 * @return Result structure. editDistance is -1 if no solution was found.
 *         endLocations and startLocations are allocated with malloc() here;
 *         presumably the caller is expected to release them - confirm against edlib.h.
 *
 * NOTE(review): results of malloc() are used without a NULL check, and the status codes
 * returned by the myersCalcEditDistance* helpers and obtainAlignment are ignored.
 */
extern "C" EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength,
                                       const char* const targetOriginal, const int targetLength,
                                       const EdlibAlignConfig config) {
    EdlibAlignResult result;
    result.status = EDLIB_STATUS_OK;
    result.editDistance = -1;
    result.endLocations = result.startLocations = NULL;
    result.numLocations = 0;
    result.alignment = NULL;
    result.alignmentLength = 0;
    result.alphabetLength = 0;

    /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
    // query and target are output buffers of transformSequences; both are released with free() below.
    unsigned char* query, * target;
    string alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
                                         &query, &target);
    result.alphabetLength = static_cast<int>(alphabet.size());
    /*-------------------------------------------------------*/

    // Handle special situation when at least one of the sequences has length 0.
    if (queryLength == 0 || targetLength == 0) {
        if (config.mode == EDLIB_MODE_NW) {
            result.editDistance = std::max(queryLength, targetLength);
            result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
            result.endLocations[0] = targetLength - 1;
            result.numLocations = 1;
        } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) {
            result.editDistance = queryLength;
            result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
            result.endLocations[0] = -1;
            result.numLocations = 1;
        } else {
            result.status = EDLIB_STATUS_ERROR;
        }

        free(query);
        free(target);
        return result;
    }

    /*--------------------- INITIALIZATION ------------------*/
    int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
    int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks
    EqualityDefinition equalityDefinition(alphabet, config.additionalEqualities, config.additionalEqualitiesLength);
    Word* Peq = buildPeq(static_cast<int>(alphabet.size()), query, queryLength, equalityDefinition);
    /*-------------------------------------------------------*/

    /*------------------ MAIN CALCULATION -------------------*/
    // TODO: Store alignment data only after k is determined? That could make things faster.
    int positionNW; // Used only when mode is NW.
    AlignmentData* alignData = NULL;
    bool dynamicK = false;
    int k = config.k;
    if (k < 0) { // If valid k is not given, auto-adjust k until solution is found.
        dynamicK = true;
        k = WORD_SIZE; // Gives better results than smaller k.
    }

    // With a fixed k the search runs exactly once; with dynamic k it is repeated with k doubled
    // each time until some solution is found.
    do {
        if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) {
            myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks,
                                            queryLength, target, targetLength,
                                            k, config.mode, &(result.editDistance),
                                            &(result.endLocations), &(result.numLocations));
        } else {  // mode == EDLIB_MODE_NW
            // NOTE(review): any mode other than HW/SHW lands here, while the empty-sequence
            // path above reports EDLIB_STATUS_ERROR for such modes.
            myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
                                    queryLength, target, targetLength,
                                    k, &(result.editDistance), &positionNW,
                                    false, &alignData, -1);
        }
        k *= 2;
    } while(dynamicK && result.editDistance == -1);

    if (result.editDistance >= 0) {  // If there is solution.
        // If NW mode, set end location explicitly.
        if (config.mode == EDLIB_MODE_NW) {
            result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
            result.endLocations[0] = targetLength - 1;
            result.numLocations = 1;
        }

        // Find starting locations.
        if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) {
            result.startLocations = static_cast<int *>(malloc(result.numLocations * sizeof(int)));
            if (config.mode == EDLIB_MODE_HW) {  // If HW, I need to calculate start locations.
                // Start of a match is found by running SHW on the reversed query against the
                // reversed target prefix that ends at the known end location.
                const unsigned char* rTarget = createReverseCopy(target, targetLength);
                const unsigned char* rQuery  = createReverseCopy(query, queryLength);
                // Peq for reversed query.
                Word* rPeq = buildPeq(static_cast<int>(alphabet.size()), rQuery, queryLength, equalityDefinition);
                for (int i = 0; i < result.numLocations; i++) {
                    int endLocation = result.endLocations[i];
                    if (endLocation == -1) {
                        // NOTE: Sometimes one of optimal solutions is that query starts before target, like this:
                        //                       AAGG <- target
                        //                   CCTT     <- query
                        //   It will never be only optimal solution and it does not happen often, however it is
                        //   possible and in that case end location will be -1. What should we do with that?
                        //   Should we just skip reporting such end location, although it is a solution?
                        //   If we do report it, what is the start location? -4? -1? Nothing?
                        // TODO: Figure this out. This has to do in general with how we think about start
                        //   and end locations.
                        //   Also, we have alignment later relying on this locations to limit the space of it's
                        //   search -> how can it do it right if these locations are negative or incorrect?
                        result.startLocations[i] = 0;  // I put 0 for now, but it does not make much sense.
                    } else {
                        int bestScoreSHW, numPositionsSHW;
                        int* positionsSHW;
                        myersCalcEditDistanceSemiGlobal(
                            rPeq, W, maxNumBlocks,
                            queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1,
                            result.editDistance, EDLIB_MODE_SHW,
                            &bestScoreSHW, &positionsSHW, &numPositionsSHW);
                        // Taking last location as start ensures that alignment will not start with insertions
                        // if it can start with mismatches instead.
                        // NOTE(review): assumes the reverse search found at least one position;
                        // positionsSHW is NULL and numPositionsSHW is 0 otherwise.
                        result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1];
                        free(positionsSHW);
                    }
                }
                delete[] rTarget;
                delete[] rQuery;
                delete[] rPeq;
            } else {  // If mode is SHW or NW
                for (int i = 0; i < result.numLocations; i++) {
                    result.startLocations[i] = 0;
                }
            }
        }

        // Find alignment -> all comes down to finding alignment for NW.
        // Currently we return alignment only for first pair of locations.
        if (config.task == EDLIB_TASK_PATH) {
            int alnStartLocation = result.startLocations[0];
            int alnEndLocation = result.endLocations[0];
            const unsigned char* alnTarget = target + alnStartLocation;
            const int alnTargetLength = alnEndLocation - alnStartLocation + 1;
            const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength);
            const unsigned char* rQuery  = createReverseCopy(query, queryLength);
            obtainAlignment(query, rQuery, queryLength,
                            alnTarget, rAlnTarget, alnTargetLength,
                            equalityDefinition, static_cast<int>(alphabet.size()), result.editDistance,
                            &(result.alignment), &(result.alignmentLength));
            delete[] rAlnTarget;
            delete[] rQuery;
        }
    }
    /*-------------------------------------------------------*/

    //--- Free memory ---//
    delete[] Peq;
    free(query);
    free(target);
    if (alignData) delete alignData;
    //-------------------//

    return result;
}
297
+
298
+ extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength,
299
+ const EdlibCigarFormat cigarFormat) {
300
+ if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) {
301
+ return 0;
302
+ }
303
+
304
+ // Maps move code from alignment to char in cigar.
305
+ // 0 1 2 3
306
+ char moveCodeToChar[] = {'=', 'I', 'D', 'X'};
307
+ if (cigarFormat == EDLIB_CIGAR_STANDARD) {
308
+ moveCodeToChar[0] = moveCodeToChar[3] = 'M';
309
+ }
310
+
311
+ vector<char>* cigar = new vector<char>();
312
+ char lastMove = 0; // Char of last move. 0 if there was no previous move.
313
+ int numOfSameMoves = 0;
314
+ for (int i = 0; i <= alignmentLength; i++) {
315
+ // if new sequence of same moves started
316
+ if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) {
317
+ // Write number of moves to cigar string.
318
+ int numDigits = 0;
319
+ for (; numOfSameMoves; numOfSameMoves /= 10) {
320
+ cigar->push_back('0' + numOfSameMoves % 10);
321
+ numDigits++;
322
+ }
323
+ reverse(cigar->end() - numDigits, cigar->end());
324
+ // Write code of move to cigar string.
325
+ cigar->push_back(lastMove);
326
+ // If not at the end, start new sequence of moves.
327
+ if (i < alignmentLength) {
328
+ // Check if alignment has valid values.
329
+ if (alignment[i] > 3) {
330
+ delete cigar;
331
+ return 0;
332
+ }
333
+ numOfSameMoves = 0;
334
+ }
335
+ }
336
+ if (i < alignmentLength) {
337
+ lastMove = moveCodeToChar[alignment[i]];
338
+ numOfSameMoves++;
339
+ }
340
+ }
341
+ cigar->push_back(0); // Null character termination.
342
+ char* cigar_ = static_cast<char *>(malloc(cigar->size() * sizeof(char)));
343
+ memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char));
344
+ delete cigar;
345
+
346
+ return cigar_;
347
+ }
348
+
349
+ /**
350
+ * Build Peq table for given query and alphabet.
351
+ * Peq is table of dimensions alphabetLength+1 x maxNumBlocks.
352
+ * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
353
+ * NOTICE: free returned array with delete[]!
354
+ */
355
+ static inline Word* buildPeq(const int alphabetLength,
356
+ const unsigned char* const query,
357
+ const int queryLength,
358
+ const EqualityDefinition& equalityDefinition) {
359
+ int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
360
+ // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
361
+ Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];
362
+
363
+ // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s
364
+ for (int symbol = 0; symbol <= alphabetLength; symbol++) {
365
+ for (int b = 0; b < maxNumBlocks; b++) {
366
+ if (symbol < alphabetLength) {
367
+ Peq[symbol * maxNumBlocks + b] = 0;
368
+ for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) {
369
+ Peq[symbol * maxNumBlocks + b] <<= 1;
370
+ // NOTE: We pretend like query is padded at the end with W wildcard symbols
371
+ if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol))
372
+ Peq[symbol * maxNumBlocks + b] += 1;
373
+ }
374
+ } else { // Last symbol is wildcard, so it is all 1s
375
+ Peq[symbol * maxNumBlocks + b] = static_cast<Word>(-1);
376
+ }
377
+ }
378
+ }
379
+
380
+ return Peq;
381
+ }
382
+
383
+
384
/**
 * Creates a newly allocated copy of seq with its elements in reverse order.
 * @param seq    Sequence to copy; must hold at least length elements.
 * @param length Number of elements to copy.
 * @return Reversed copy. Free returned array with delete[].
 */
static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) {
    unsigned char* const reversed = new unsigned char[length];
    std::reverse_copy(seq, seq + length, reversed);
    return reversed;
}
395
+
396
+ /**
397
+ * Corresponds to Advance_Block function from Myers.
398
+ * Calculates one word(block), which is part of a column.
399
+ * Highest bit of word (one most to the left) is most bottom cell of block from column.
400
+ * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1].
401
+ * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0.
402
+ * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0.
403
+ * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch.
404
+ * @param [in] hin Will be +1, 0 or -1.
405
+ * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
406
+ * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
407
+ * @param [out] hout Will be +1, 0 or -1.
408
+ */
409
+ static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
410
+ Word &PvOut, Word &MvOut) {
411
+ // hin can be 1, -1 or 0.
412
+ // 1 -> 00...01
413
+ // 0 -> 00...00
414
+ // -1 -> 11...11 (2-complement)
415
+
416
+ Word hinIsNeg = static_cast<Word>(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1
417
+
418
+ Word Xv = Eq | Mv;
419
+ // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
420
+ Eq |= hinIsNeg;
421
+ Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
422
+
423
+ Word Ph = Mv | ~(Xh | Pv);
424
+ Word Mh = Pv & Xh;
425
+
426
+ int hout = 0;
427
+ // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
428
+ hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
429
+ // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
430
+ hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
431
+
432
+ Ph <<= 1;
433
+ Mh <<= 1;
434
+
435
+ // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
436
+ Mh |= hinIsNeg;
437
+ // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
438
+ Ph |= static_cast<Word>((hin + 1) >> 1);
439
+
440
+ PvOut = Mh | ~(Xv | Ph);
441
+ MvOut = Ph & Xv;
442
+
443
+ return hout;
444
+ }
445
+
446
/**
 * Ceiling division: x / y rounded up.
 * Note: x must be non-negative and y must be positive.
 */
static inline int ceilDiv(const int x, const int y) {
    const int quotient = x / y;
    if (x % y) {
        return quotient + 1;
    }
    return quotient;
}
453
+
454
// Returns the smaller of two ints; y when they are equal.
static inline int min(const int x, const int y) {
    if (x < y) {
        return x;
    }
    return y;
}
457
+
458
// Returns the larger of two ints; y when they are equal.
static inline int max(const int x, const int y) {
    if (x > y) {
        return x;
    }
    return y;
}
461
+
462
+
463
+ /**
464
+ * @param [in] block
465
+ * @return Values of cells in block, starting with bottom cell in block.
466
+ */
467
+ static inline vector<int> getBlockCellValues(const Block block) {
468
+ vector<int> scores(WORD_SIZE);
469
+ int score = block.score;
470
+ Word mask = HIGH_BIT_MASK;
471
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
472
+ scores[i] = score;
473
+ if (block.P & mask) score--;
474
+ if (block.M & mask) score++;
475
+ mask >>= 1;
476
+ }
477
+ scores[WORD_SIZE - 1] = score;
478
+ return scores;
479
+ }
480
+
481
+ /**
482
+ * Writes values of cells in block into given array, starting with first/top cell.
483
+ * @param [in] block
484
+ * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
485
+ */
486
+ static inline void readBlock(const Block block, int* const dest) {
487
+ int score = block.score;
488
+ Word mask = HIGH_BIT_MASK;
489
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
490
+ dest[WORD_SIZE - 1 - i] = score;
491
+ if (block.P & mask) score--;
492
+ if (block.M & mask) score++;
493
+ mask >>= 1;
494
+ }
495
+ dest[0] = score;
496
+ }
497
+
498
+ /**
499
+ * Writes values of cells in block into given array, starting with last/bottom cell.
500
+ * @param [in] block
501
+ * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
502
+ */
503
+ static inline void readBlockReverse(const Block block, int* const dest) {
504
+ int score = block.score;
505
+ Word mask = HIGH_BIT_MASK;
506
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
507
+ dest[i] = score;
508
+ if (block.P & mask) score--;
509
+ if (block.M & mask) score++;
510
+ mask >>= 1;
511
+ }
512
+ dest[WORD_SIZE - 1] = score;
513
+ }
514
+
515
+ /**
516
+ * @param [in] block
517
+ * @param [in] k
518
+ * @return True if all cells in block have value larger than k, otherwise false.
519
+ */
520
+ static inline bool allBlockCellsLarger(const Block block, const int k) {
521
+ vector<int> scores = getBlockCellValues(block);
522
+ for (int i = 0; i < WORD_SIZE; i++) {
523
+ if (scores[i] <= k) return false;
524
+ }
525
+ return true;
526
+ }
527
+
528
+
529
/**
 * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods.
 * Columns of the dynamic programming matrix are processed one target character at a time,
 * and only the blocks inside the current band [firstBlock, lastBlock] are calculated.
 * @param [in] Peq  Query profile.
 * @param [in] W  Size of padding in last block.
 *                TODO: Calculate this directly from query, instead of passing it.
 * @param [in] maxNumBlocks  Number of blocks needed to cover the whole query.
 *                           TODO: Calculate this directly from query, instead of passing it.
 * @param [in] queryLength
 * @param [in] target
 * @param [in] targetLength
 * @param [in] k  Only solutions with score not larger than k are looked for.
 * @param [in] mode  EDLIB_MODE_HW or EDLIB_MODE_SHW
 * @param [out] bestScore_  Edit distance, or -1 if no solution was found.
 * @param [out] positions_  Array of 0-indexed positions in target at which best score was found.
 *                          NULL if no solution was found.
 *                          Make sure to free this array with free().
 * @param [out] numPositions_  Number of positions in the positions_ array.
 * @return Status. Always EDLIB_STATUS_OK in this implementation.
 */
static int myersCalcEditDistanceSemiGlobal(
        const Word* const Peq, const int W, const int maxNumBlocks,
        const int queryLength,
        const unsigned char* const target, const int targetLength,
        int k, const EdlibAlignMode mode,
        int* const bestScore_, int** const positions_, int* const numPositions_) {
    *positions_ = NULL;
    *numPositions_ = 0;

    // firstBlock is 0-based index of first block in Ukkonen band.
    // lastBlock is 0-based index of last block in Ukkonen band.
    // NOTE(review): k + 1 overflows when k == INT_MAX, and for HW it is evaluated before k is
    // limited by queryLength below.
    int firstBlock = 0;
    int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
    Block *bl; // Current block

    Block* blocks = new Block[maxNumBlocks];

    // For HW, solution will never be larger than queryLength.
    if (mode == EDLIB_MODE_HW) {
        k = min(queryLength, k);
    }

    // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
    // This gives speed up of about 2 times for small k.
    const int STRONG_REDUCE_NUM = 2048;

    // Initialize P, M and score
    bl = blocks;
    for (int b = 0; b <= lastBlock; b++) {
        bl->score = (b + 1) * WORD_SIZE;
        bl->P = static_cast<Word>(-1); // All 1s
        bl->M = static_cast<Word>(0);
        bl++;
    }

    int bestScore = -1;
    vector<int> positions; // TODO: Maybe put this on heap?
    const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
    const unsigned char* targetChar = target;
    for (int c = 0; c < targetLength; c++) { // for each column
        // Row of the query profile for the current target symbol.
        const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;

        //----------------------- Calculate column -------------------------//
        int hout = startHout;
        bl = blocks + firstBlock;
        Peq_c += firstBlock;
        for (int b = firstBlock; b <= lastBlock; b++) {
            // hout of one block is hin of the block below it.
            hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M);
            bl->score += hout;
            bl++; Peq_c++;
        }
        // Step back so that bl and Peq_c point to the last calculated block.
        bl--; Peq_c--;
        //------------------------------------------------------------------//

        //---------- Adjust number of blocks according to Ukkonen ----------//
        if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block
            && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
            // If score of left block is not too big, calculate one more block
            lastBlock++; bl++; Peq_c++;
            bl->P = static_cast<Word>(-1); // All 1s
            bl->M = static_cast<Word>(0);
            bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M);
        } else {
            // Cheap reduction: drop last blocks whose bottom cell is at least WORD_SIZE above k.
            while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
                lastBlock--; bl--; Peq_c--;
            }
        }

        // Every some columns, do some expensive but also more efficient block reducing.
        // This is important!
        //
        // Reduce the band by decreasing last block if possible.
        if (c % STRONG_REDUCE_NUM == 0) {
            while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
                lastBlock--; bl--; Peq_c--;
            }
        }
        // For HW, even if all cells are > k, there still may be solution in next
        // column because starting conditions at upper boundary are 0.
        // That means that first block is always candidate for solution,
        // and we can never end calculation before last column.
        if (mode == EDLIB_MODE_HW && lastBlock == -1) {
            lastBlock++; bl++; Peq_c++;
        }

        // Reduce band by increasing first block if possible. Not applicable to HW.
        if (mode != EDLIB_MODE_HW) {
            while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
                firstBlock++;
            }
            if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks
                while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
                    firstBlock++;
                }
            }
        }

        // If band stops to exist finish
        if (lastBlock < firstBlock) {
            *bestScore_ = bestScore;
            if (bestScore != -1) {
                // NOTE(review): result of malloc() is not checked for NULL.
                *positions_ = static_cast<int *>(malloc(sizeof(int) * static_cast<int>(positions.size())));
                *numPositions_ = static_cast<int>(positions.size());
                copy(positions.begin(), positions.end(), *positions_);
            }
            delete[] blocks;
            return EDLIB_STATUS_OK;
        }
        //------------------------------------------------------------------//

        //------------------------- Update best score ----------------------//
        if (lastBlock == maxNumBlocks - 1) {
            int colScore = bl->score;
            if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k.
                // NOTE: Score that I find in column c is actually score from column c-W
                if (bestScore == -1 || colScore <= bestScore) {
                    if (colScore != bestScore) {
                        positions.clear();
                        bestScore = colScore;
                        // Change k so we will look only for equal or better
                        // scores than the best found so far.
                        k = bestScore;
                    }
                    // NOTE(review): c - W is negative while c < W; the caller in this file
                    // special-cases an end location of -1.
                    positions.push_back(c - W);
                }
            }
        }
        //------------------------------------------------------------------//

        targetChar++;
    }


    // Obtain results for last W columns from last column.
    if (lastBlock == maxNumBlocks - 1) {
        // blockScores[0] is the bottom cell of the last block, already handled in the loop above;
        // the next W cells hold the scores for the last W columns.
        vector<int> blockScores = getBlockCellValues(*bl);
        for (int i = 0; i < W; i++) {
            int colScore = blockScores[i + 1];
            if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
                if (colScore != bestScore) {
                    positions.clear();
                    k = bestScore = colScore;
                }
                positions.push_back(targetLength - W + i);
            }
        }
    }

    *bestScore_ = bestScore;
    if (bestScore != -1) {
        // NOTE(review): result of malloc() is not checked for NULL.
        *positions_ = static_cast<int *>(malloc(sizeof(int) * static_cast<int>(positions.size())));
        *numPositions_ = static_cast<int>(positions.size());
        copy(positions.begin(), positions.end(), *positions_);
    }

    delete[] blocks;
    return EDLIB_STATUS_OK;
}
705
+
706
+
707
+ /**
708
+ * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method.
709
+ * @param [in] Peq Query profile.
710
+ * @param [in] W Size of padding in last block.
711
+ * TODO: Calculate this directly from query, instead of passing it.
712
+ * @param [in] maxNumBlocks Number of blocks needed to cover the whole query.
713
+ * TODO: Calculate this directly from query, instead of passing it.
714
+ * @param [in] queryLength
715
+ * @param [in] target
716
+ * @param [in] targetLength
717
+ * @param [in] k
718
+ * @param [out] bestScore_ Edit distance.
719
+ * @param [out] position_ 0-indexed position in target at which best score was found.
720
+ * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned.
721
+ * Quadratic amount of memory is consumed.
722
+ * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment).
723
+ * Set only if findAlignment is set to true, otherwise it is NULL.
724
+ * Make sure to free this array using delete[].
725
+ * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected.
726
+ * If set to p, calculation is performed up to position p in target (inclusive)
727
+ * and column p is returned as the only column in alignData.
728
+ * @return Status.
729
+ */
730
+ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks,
731
+ const int queryLength,
732
+ const unsigned char* const target, const int targetLength,
733
+ int k, int* const bestScore_,
734
+ int* const position_, const bool findAlignment,
735
+ AlignmentData** const alignData, const int targetStopPosition) {
736
+ if (targetStopPosition > -1 && findAlignment) {
737
+ // They can not be both set at the same time!
738
+ return EDLIB_STATUS_ERROR;
739
+ }
740
+
741
+ // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
742
+ const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation
743
+
744
+ if (k < abs(targetLength - queryLength)) {
745
+ *bestScore_ = *position_ = -1;
746
+ return EDLIB_STATUS_OK;
747
+ }
748
+
749
+ k = min(k, max(queryLength, targetLength)); // Upper bound for k
750
+
751
+ // firstBlock is 0-based index of first block in Ukkonen band.
752
+ // lastBlock is 0-based index of last block in Ukkonen band.
753
+ int firstBlock = 0;
754
+ // This is optimal now, by my formula.
755
+ int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1;
756
+ Block* bl; // Current block
757
+
758
+ Block* blocks = new Block[maxNumBlocks];
759
+
760
+ // Initialize P, M and score
761
+ bl = blocks;
762
+ for (int b = 0; b <= lastBlock; b++) {
763
+ bl->score = (b + 1) * WORD_SIZE;
764
+ bl->P = static_cast<Word>(-1); // All 1s
765
+ bl->M = static_cast<Word>(0);
766
+ bl++;
767
+ }
768
+
769
+ // If we want to find alignment, we have to store needed data.
770
+ if (findAlignment)
771
+ *alignData = new AlignmentData(maxNumBlocks, targetLength);
772
+ else if (targetStopPosition > -1)
773
+ *alignData = new AlignmentData(maxNumBlocks, 1);
774
+ else
775
+ *alignData = NULL;
776
+
777
+ const unsigned char* targetChar = target;
778
+ for (int c = 0; c < targetLength; c++) { // for each column
779
+ const Word* Peq_c = Peq + *targetChar * maxNumBlocks;
780
+
781
+ //----------------------- Calculate column -------------------------//
782
+ int hout = 1;
783
+ bl = blocks + firstBlock;
784
+ for (int b = firstBlock; b <= lastBlock; b++) {
785
+ hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M);
786
+ bl->score += hout;
787
+ bl++;
788
+ }
789
+ bl--;
790
+ //------------------------------------------------------------------//
791
+ // bl now points to last block
792
+
793
+ // Update k. I do it only on end of column because it would slow calculation too much otherwise.
794
+ // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up.
795
+ k = min(k, bl->score
796
+ + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1)
797
+ + (lastBlock == maxNumBlocks - 1 ? W : 0));
798
+
799
+ //---------- Adjust number of blocks according to Ukkonen ----------//
800
+ //--- Adjust last block ---//
801
+ // If block is not beneath band, calculate next block. Only next because others are certainly beneath band.
802
+ if (lastBlock + 1 < maxNumBlocks
803
+ && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also!
804
+ ((lastBlock + 1) * WORD_SIZE - 1
805
+ > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) {
806
+ lastBlock++; bl++;
807
+ bl->P = static_cast<Word>(-1); // All 1s
808
+ bl->M = static_cast<Word>(0);
809
+ int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M);
810
+ bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout;
811
+ hout = newHout;
812
+ }
813
+
814
+ // While block is out of band, move one block up.
815
+ // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it.
816
+ // I could consider adding that max part, for optimal performance.
817
+ while (lastBlock >= firstBlock
818
+ && (bl->score >= k + WORD_SIZE
819
+ || ((lastBlock + 1) * WORD_SIZE - 1 >
820
+ // TODO: Does not work if do not put +1! Why???
821
+ k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) {
822
+ lastBlock--; bl--;
823
+ }
824
+ //-------------------------//
825
+
826
+ //--- Adjust first block ---//
827
+ // While outside of band, advance block
828
+ while (firstBlock <= lastBlock
829
+ && (blocks[firstBlock].score >= k + WORD_SIZE
830
+ || ((firstBlock + 1) * WORD_SIZE - 1 <
831
+ blocks[firstBlock].score - k - targetLength + queryLength + c))) {
832
+ firstBlock++;
833
+ }
834
+ //--------------------------/
835
+
836
+
837
+ // TODO: consider if this part is useful, it does not seem to help much
838
+ if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction
839
+ while (lastBlock >= firstBlock) {
840
+ // If all cells outside of band, remove block
841
+ vector<int> scores = getBlockCellValues(*bl);
842
+ int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
843
+ int r = lastBlock * WORD_SIZE + numCells - 1;
844
+ bool reduce = true;
845
+ for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
846
+ // TODO: Does not work if do not put +1! Why???
847
+ if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) {
848
+ reduce = false;
849
+ break;
850
+ }
851
+ r--;
852
+ }
853
+ if (!reduce) break;
854
+ lastBlock--; bl--;
855
+ }
856
+
857
+ while (firstBlock <= lastBlock) {
858
+ // If all cells outside of band, remove block
859
+ vector<int> scores = getBlockCellValues(blocks[firstBlock]);
860
+ int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
861
+ int r = firstBlock * WORD_SIZE + numCells - 1;
862
+ bool reduce = true;
863
+ for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
864
+ if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) {
865
+ reduce = false;
866
+ break;
867
+ }
868
+ r--;
869
+ }
870
+ if (!reduce) break;
871
+ firstBlock++;
872
+ }
873
+ }
874
+
875
+
876
+ // If band stops to exist finish
877
+ if (lastBlock < firstBlock) {
878
+ *bestScore_ = *position_ = -1;
879
+ delete[] blocks;
880
+ return EDLIB_STATUS_OK;
881
+ }
882
+ //------------------------------------------------------------------//
883
+
884
+
885
+ //---- Save column so it can be used for reconstruction ----//
886
+ if (findAlignment && c < targetLength) {
887
+ bl = blocks + firstBlock;
888
+ for (int b = firstBlock; b <= lastBlock; b++) {
889
+ (*alignData)->Ps[maxNumBlocks * c + b] = bl->P;
890
+ (*alignData)->Ms[maxNumBlocks * c + b] = bl->M;
891
+ (*alignData)->scores[maxNumBlocks * c + b] = bl->score;
892
+ (*alignData)->firstBlocks[c] = firstBlock;
893
+ (*alignData)->lastBlocks[c] = lastBlock;
894
+ bl++;
895
+ }
896
+ }
897
+ //----------------------------------------------------------//
898
+ //---- If this is stop column, save it and finish ----//
899
+ if (c == targetStopPosition) {
900
+ for (int b = firstBlock; b <= lastBlock; b++) {
901
+ (*alignData)->Ps[b] = (blocks + b)->P;
902
+ (*alignData)->Ms[b] = (blocks + b)->M;
903
+ (*alignData)->scores[b] = (blocks + b)->score;
904
+ (*alignData)->firstBlocks[0] = firstBlock;
905
+ (*alignData)->lastBlocks[0] = lastBlock;
906
+ }
907
+ *bestScore_ = -1;
908
+ *position_ = targetStopPosition;
909
+ delete[] blocks;
910
+ return EDLIB_STATUS_OK;
911
+ }
912
+ //----------------------------------------------------//
913
+
914
+ targetChar++;
915
+ }
916
+
917
+ if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated
918
+ // Obtain best score from block -> it is complicated because query is padded with W cells
919
+ int bestScore = getBlockCellValues(blocks[lastBlock])[W];
920
+ if (bestScore <= k) {
921
+ *bestScore_ = bestScore;
922
+ *position_ = targetLength - 1;
923
+ delete[] blocks;
924
+ return EDLIB_STATUS_OK;
925
+ }
926
+ }
927
+
928
+ *bestScore_ = *position_ = -1;
929
+ delete[] blocks;
930
+ return EDLIB_STATUS_OK;
931
+ }
932
+
933
+
934
+ /**
935
+ * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix,
936
+ * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength).
937
+ * @param [in] queryLength Normal length, without W.
938
+ * @param [in] targetLength Normal length, without W.
939
+ * @param [in] bestScore Best score.
940
+ * @param [in] alignData Data obtained during finding best score that is useful for finding alignment.
941
+ * @param [out] alignment Alignment.
942
+ * @param [out] alignmentLength Length of alignment.
943
+ * @return Status code.
944
+ */
945
+ static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
946
+ const int bestScore, const AlignmentData* const alignData,
947
+ unsigned char** const alignment, int* const alignmentLength) {
948
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
949
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
950
+
951
+ *alignment = static_cast<unsigned char*>(malloc((queryLength + targetLength - 1) * sizeof(unsigned char)));
952
+ *alignmentLength = 0;
953
+ int c = targetLength - 1; // index of column
954
+ int b = maxNumBlocks - 1; // index of block in column
955
+ int currScore = bestScore; // Score of current cell
956
+ int lScore = -1; // Score of left cell
957
+ int uScore = -1; // Score of upper cell
958
+ int ulScore = -1; // Score of upper left cell
959
+ Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block
960
+ Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block
961
+ // True if block to left exists and is in band
962
+ bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1];
963
+ // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the
964
+ // calculation as both lP and lM should be initialized at some moment later (but compiler can not
965
+ // detect it since this initialization is guaranteed by "business" logic).
966
+ Word lP = 0, lM = 0;
967
+ if (thereIsLeftBlock) {
968
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left
969
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left
970
+ }
971
+ currP <<= W;
972
+ currM <<= W;
973
+ int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos
974
+
975
+ // TODO(martin): refactor this whole piece of code. There are too many if-else statements,
976
+ // it is too easy for a bug to hide and to hard to effectively cover all the edge-cases.
977
+ // We need better separation of logic and responsibilities.
978
+ while (true) {
979
+ if (c == 0) {
980
+ thereIsLeftBlock = true;
981
+ lScore = b * WORD_SIZE + blockPos + 1;
982
+ ulScore = lScore - 1;
983
+ }
984
+
985
+ // TODO: improvement: calculate only those cells that are needed,
986
+ // for example if I calculate upper cell and can move up,
987
+ // there is no need to calculate left and upper left cell
988
+ //---------- Calculate scores ---------//
989
+ if (lScore == -1 && thereIsLeftBlock) {
990
+ lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left
991
+ for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) {
992
+ if (lP & HIGH_BIT_MASK) lScore--;
993
+ if (lM & HIGH_BIT_MASK) lScore++;
994
+ lP <<= 1;
995
+ lM <<= 1;
996
+ }
997
+ }
998
+ if (ulScore == -1) {
999
+ if (lScore != -1) {
1000
+ ulScore = lScore;
1001
+ if (lP & HIGH_BIT_MASK) ulScore--;
1002
+ if (lM & HIGH_BIT_MASK) ulScore++;
1003
+ }
1004
+ else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) {
1005
+ // This is the case when upper left cell is last cell in block,
1006
+ // and block to left is not in band so lScore is -1.
1007
+ ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1];
1008
+ }
1009
+ }
1010
+ if (uScore == -1) {
1011
+ uScore = currScore;
1012
+ if (currP & HIGH_BIT_MASK) uScore--;
1013
+ if (currM & HIGH_BIT_MASK) uScore++;
1014
+ currP <<= 1;
1015
+ currM <<= 1;
1016
+ }
1017
+ //-------------------------------------//
1018
+
1019
+ // TODO: should I check if there is upper block?
1020
+
1021
+ //-------------- Move --------------//
1022
+ // Move up - insertion to target - deletion from query
1023
+ if (uScore != -1 && uScore + 1 == currScore) {
1024
+ currScore = uScore;
1025
+ lScore = ulScore;
1026
+ uScore = ulScore = -1;
1027
+ if (blockPos == 0) { // If entering new (upper) block
1028
+ if (b == 0) { // If there are no cells above (only boundary cells)
1029
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up
1030
+ for (int i = 0; i < c + 1; i++) // Move left until end
1031
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
1032
+ break;
1033
+ } else {
1034
+ blockPos = WORD_SIZE - 1;
1035
+ b--;
1036
+ currP = alignData->Ps[c * maxNumBlocks + b];
1037
+ currM = alignData->Ms[c * maxNumBlocks + b];
1038
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
1039
+ thereIsLeftBlock = true;
1040
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations
1041
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
1042
+ } else {
1043
+ thereIsLeftBlock = false;
1044
+ // TODO(martin): There may not be left block, but there can be left boundary - do we
1045
+ // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this.
1046
+ }
1047
+ }
1048
+ } else {
1049
+ blockPos--;
1050
+ lP <<= 1;
1051
+ lM <<= 1;
1052
+ }
1053
+ // Mark move
1054
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
1055
+ }
1056
+ // Move left - deletion from target - insertion to query
1057
+ else if (lScore != -1 && lScore + 1 == currScore) {
1058
+ currScore = lScore;
1059
+ uScore = ulScore;
1060
+ lScore = ulScore = -1;
1061
+ c--;
1062
+ if (c == -1) { // If there are no cells to the left (only boundary cells)
1063
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left
1064
+ int numUp = b * WORD_SIZE + blockPos + 1;
1065
+ for (int i = 0; i < numUp; i++) // Move up until end
1066
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
1067
+ break;
1068
+ }
1069
+ currP = lP;
1070
+ currM = lM;
1071
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
1072
+ thereIsLeftBlock = true;
1073
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
1074
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
1075
+ } else {
1076
+ if (c == 0) { // If there are no cells to the left (only boundary cells)
1077
+ thereIsLeftBlock = true;
1078
+ lScore = b * WORD_SIZE + blockPos + 1;
1079
+ ulScore = lScore - 1;
1080
+ } else {
1081
+ thereIsLeftBlock = false;
1082
+ }
1083
+ }
1084
+ // Mark move
1085
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
1086
+ }
1087
+ // Move up left - (mis)match
1088
+ else if (ulScore != -1) {
1089
+ unsigned char moveCode = ulScore == currScore ? EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH;
1090
+ currScore = ulScore;
1091
+ uScore = lScore = ulScore = -1;
1092
+ c--;
1093
+ if (c == -1) { // If there are no cells to the left (only boundary cells)
1094
+ (*alignment)[(*alignmentLength)++] = moveCode; // Move left
1095
+ int numUp = b * WORD_SIZE + blockPos;
1096
+ for (int i = 0; i < numUp; i++) // Move up until end
1097
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
1098
+ break;
1099
+ }
1100
+ if (blockPos == 0) { // If entering upper left block
1101
+ if (b == 0) { // If there are no more cells above (only boundary cells)
1102
+ (*alignment)[(*alignmentLength)++] = moveCode; // Move up left
1103
+ for (int i = 0; i < c + 1; i++) // Move left until end
1104
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
1105
+ break;
1106
+ }
1107
+ blockPos = WORD_SIZE - 1;
1108
+ b--;
1109
+ currP = alignData->Ps[c * maxNumBlocks + b];
1110
+ currM = alignData->Ms[c * maxNumBlocks + b];
1111
+ } else { // If entering left block
1112
+ blockPos--;
1113
+ currP = lP;
1114
+ currM = lM;
1115
+ currP <<= 1;
1116
+ currM <<= 1;
1117
+ }
1118
+ // Set new left block
1119
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
1120
+ thereIsLeftBlock = true;
1121
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
1122
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
1123
+ } else {
1124
+ if (c == 0) { // If there are no cells to the left (only boundary cells)
1125
+ thereIsLeftBlock = true;
1126
+ lScore = b * WORD_SIZE + blockPos + 1;
1127
+ ulScore = lScore - 1;
1128
+ } else {
1129
+ thereIsLeftBlock = false;
1130
+ }
1131
+ }
1132
+ // Mark move
1133
+ (*alignment)[(*alignmentLength)++] = moveCode;
1134
+ } else {
1135
+ // Reached end - finished!
1136
+ break;
1137
+ }
1138
+ //----------------------------------//
1139
+ }
1140
+
1141
+ *alignment = static_cast<unsigned char*>(realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)));
1142
+ reverse(*alignment, *alignment + (*alignmentLength));
1143
+ return EDLIB_STATUS_OK;
1144
+ }
1145
+
1146
+
1147
+ /**
1148
+ * Finds one possible alignment that gives optimal score (bestScore).
1149
+ * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough,
1150
+ * it will solve them using traceback algorithm.
1151
+ * @param [in] query
1152
+ * @param [in] rQuery Reversed query.
1153
+ * @param [in] queryLength
1154
+ * @param [in] target
1155
+ * @param [in] rTarget Reversed target.
1156
+ * @param [in] targetLength
1157
+ * @param [in] equalityDefinition
1158
+ * @param [in] alphabetLength
1159
+ * @param [in] bestScore Best(optimal) score.
1160
+ * @param [out] alignment Sequence of edit operations that make target equal to query.
1161
+ * @param [out] alignmentLength Length of alignment.
1162
+ * @return Status code.
1163
+ */
1164
+ static int obtainAlignment(
1165
+ const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
1166
+ const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
1167
+ const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
1168
+ unsigned char** const alignment, int* const alignmentLength) {
1169
+
1170
+ // Handle special case when one of sequences has length of 0.
1171
+ if (queryLength == 0 || targetLength == 0) {
1172
+ *alignmentLength = targetLength + queryLength;
1173
+ *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
1174
+ for (int i = 0; i < *alignmentLength; i++) {
1175
+ (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT;
1176
+ }
1177
+ return EDLIB_STATUS_OK;
1178
+ }
1179
+
1180
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
1181
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
1182
+ int statusCode;
1183
+
1184
+ // TODO: think about reducing number of memory allocations in alignment functions, probably
1185
+ // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg,
1186
+ // and it could also be done for alignments - we could have one big array for alignment that would be
1187
+ // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results.
1188
+
1189
+ // If estimated memory consumption for traceback algorithm is smaller than 1MB use it,
1190
+ // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal.
1191
+ long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength
1192
+ + 2ll * sizeof(int) * targetLength;
1193
+ if (alignmentDataSize < 1024 * 1024) {
1194
+ int score_, endLocation_; // Used only to call function.
1195
+ AlignmentData* alignData = NULL;
1196
+ Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
1197
+ myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
1198
+ queryLength,
1199
+ target, targetLength,
1200
+ bestScore,
1201
+ &score_, &endLocation_, true, &alignData, -1);
1202
+ //assert(score_ == bestScore);
1203
+ //assert(endLocation_ == targetLength - 1);
1204
+
1205
+ statusCode = obtainAlignmentTraceback(queryLength, targetLength,
1206
+ bestScore, alignData, alignment, alignmentLength);
1207
+ delete alignData;
1208
+ delete[] Peq;
1209
+ } else {
1210
+ statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength,
1211
+ target, rTarget, targetLength,
1212
+ equalityDefinition, alphabetLength, bestScore,
1213
+ alignment, alignmentLength);
1214
+ }
1215
+ return statusCode;
1216
+ }
1217
+
1218
+
1219
+ /**
1220
+ * Finds one possible alignment that gives optimal score (bestScore).
1221
+ * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together.
1222
+ * @param [in] query
1223
+ * @param [in] rQuery Reversed query.
1224
+ * @param [in] queryLength
1225
+ * @param [in] target
1226
+ * @param [in] rTarget Reversed target.
1227
+ * @param [in] targetLength
1228
+ * @param [in] alphabetLength
1229
+ * @param [in] bestScore Best(optimal) score.
1230
+ * @param [out] alignment Sequence of edit operations that make target equal to query.
1231
+ * @param [out] alignmentLength Length of alignment.
1232
+ * @return Status code.
1233
+ */
1234
+ static int obtainAlignmentHirschberg(
1235
+ const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
1236
+ const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
1237
+ const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
1238
+ unsigned char** const alignment, int* const alignmentLength) {
1239
+
1240
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
1241
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
1242
+
1243
+ Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
1244
+ Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition);
1245
+
1246
+ // Used only to call functions.
1247
+ int score_, endLocation_;
1248
+
1249
+ // Divide dynamic matrix into two halfs, left and right.
1250
+ const int leftHalfWidth = targetLength / 2;
1251
+ const int rightHalfWidth = targetLength - leftHalfWidth;
1252
+
1253
+ // Calculate left half.
1254
+ AlignmentData* alignDataLeftHalf = NULL;
1255
+ int leftHalfCalcStatus = myersCalcEditDistanceNW(
1256
+ Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore,
1257
+ &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1);
1258
+
1259
+ // Calculate right half.
1260
+ AlignmentData* alignDataRightHalf = NULL;
1261
+ int rightHalfCalcStatus = myersCalcEditDistanceNW(
1262
+ rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore,
1263
+ &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1);
1264
+
1265
+ delete[] Peq;
1266
+ delete[] rPeq;
1267
+
1268
+ if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) {
1269
+ if (alignDataLeftHalf) delete alignDataLeftHalf;
1270
+ if (alignDataRightHalf) delete alignDataRightHalf;
1271
+ return EDLIB_STATUS_ERROR;
1272
+ }
1273
+
1274
+ // Unwrap the left half.
1275
+ int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0];
1276
+ int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0];
1277
+ // TODO: avoid this allocation by using some shared array?
1278
+ // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index)
1279
+ // and ending with scoresLeftEndIdx row (0-indexed).
1280
+ int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE;
1281
+ int* scoresLeft = new int[scoresLeftLength];
1282
+ for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) {
1283
+ Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx],
1284
+ alignDataLeftHalf->scores[blockIdx]);
1285
+ readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE);
1286
+ }
1287
+ int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE;
1288
+ // If last block contains padding, shorten the length of scores for the length of padding.
1289
+ if (lastBlockIdxLeft == maxNumBlocks - 1) {
1290
+ scoresLeftLength -= W;
1291
+ }
1292
+
1293
+ // Unwrap the right half (I also reverse it while unwraping).
1294
+ int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0];
1295
+ int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0];
1296
+ int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE;
1297
+ int* scoresRight = new int[scoresRightLength];
1298
+ int* scoresRightOriginalStart = scoresRight;
1299
+ for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) {
1300
+ Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx],
1301
+ alignDataRightHalf->scores[blockIdx]);
1302
+ readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE);
1303
+ }
1304
+ int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE;
1305
+ // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do),
1306
+ // move pointer forward to remove the padding (that is why we remember originalStart).
1307
+ if (scoresRightStartIdx < 0) {
1308
+ //assert(scoresRightStartIdx == -1 * W);
1309
+ scoresRight += W;
1310
+ scoresRightStartIdx += W;
1311
+ scoresRightLength -= W;
1312
+ }
1313
+
1314
+ delete alignDataLeftHalf;
1315
+ delete alignDataRightHalf;
1316
+
1317
+ //--------------------- Find the best move ----------------//
1318
+ // Find the query/row index of cell in left column which together with its lower right neighbour
1319
+ // from right column gives the best score (when summed). We also have to consider boundary cells
1320
+ // (those cells at -1 indexes).
1321
+ // x|
1322
+ // -+-
1323
+ // |x
1324
+ int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1);
1325
+ int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1,
1326
+ scoresRightStartIdx + scoresRightLength - 2);
1327
+ int leftScore = -1, rightScore = -1;
1328
+ int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through.
1329
+ bool queryIdxLeftAlignmentFound = false;
1330
+ for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) {
1331
+ leftScore = scoresLeft[queryIdx - scoresLeftStartIdx];
1332
+ rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx];
1333
+ if (leftScore + rightScore == bestScore) {
1334
+ queryIdxLeftAlignment = queryIdx;
1335
+ queryIdxLeftAlignmentFound = true;
1336
+ break;
1337
+ }
1338
+ }
1339
+ // Check boundary cells.
1340
+ if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) {
1341
+ leftScore = leftHalfWidth;
1342
+ rightScore = scoresRight[0];
1343
+ if (leftScore + rightScore == bestScore) {
1344
+ queryIdxLeftAlignment = -1;
1345
+ queryIdxLeftAlignmentFound = true;
1346
+ }
1347
+ }
1348
+ if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength
1349
+ && scoresRightStartIdx + scoresRightLength == queryLength) {
1350
+ leftScore = scoresLeft[scoresLeftLength - 1];
1351
+ rightScore = rightHalfWidth;
1352
+ if (leftScore + rightScore == bestScore) {
1353
+ queryIdxLeftAlignment = queryLength - 1;
1354
+ queryIdxLeftAlignmentFound = true;
1355
+ }
1356
+ }
1357
+
1358
+ delete[] scoresLeft;
1359
+ delete[] scoresRightOriginalStart;
1360
+
1361
+ if (queryIdxLeftAlignmentFound == false) {
1362
+ // If there was no move that is part of optimal alignment, then there is no such alignment
1363
+ // or given bestScore is not correct!
1364
+ return EDLIB_STATUS_ERROR;
1365
+ }
1366
+ //----------------------------------------------------------//
1367
+
1368
+ // Calculate alignments for upper half of left half (upper left - ul)
1369
+ // and lower half of right half (lower right - lr).
1370
+ const int ulHeight = queryIdxLeftAlignment + 1;
1371
+ const int lrHeight = queryLength - ulHeight;
1372
+ const int ulWidth = leftHalfWidth;
1373
+ const int lrWidth = rightHalfWidth;
1374
+ unsigned char* ulAlignment = NULL; int ulAlignmentLength;
1375
+ int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight,
1376
+ target, rTarget + lrWidth, ulWidth,
1377
+ equalityDefinition, alphabetLength, leftScore,
1378
+ &ulAlignment, &ulAlignmentLength);
1379
+ unsigned char* lrAlignment = NULL; int lrAlignmentLength;
1380
+ int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight,
1381
+ target + ulWidth, rTarget, lrWidth,
1382
+ equalityDefinition, alphabetLength, rightScore,
1383
+ &lrAlignment, &lrAlignmentLength);
1384
+ if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) {
1385
+ if (ulAlignment) free(ulAlignment);
1386
+ if (lrAlignment) free(lrAlignment);
1387
+ return EDLIB_STATUS_ERROR;
1388
+ }
1389
+
1390
+ // Build alignment by concatenating upper left alignment with lower right alignment.
1391
+ *alignmentLength = ulAlignmentLength + lrAlignmentLength;
1392
+ *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
1393
+ memcpy(*alignment, ulAlignment, ulAlignmentLength);
1394
+ memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength);
1395
+
1396
+ free(ulAlignment);
1397
+ free(lrAlignment);
1398
+ return EDLIB_STATUS_OK;
1399
+ }
1400
+
1401
+
1402
/**
 * Converts the raw query and target into sequences of alphabet indices.
 * The alphabet is discovered on the fly: the first time a character is seen (query is scanned
 * first, then target) it is appended to the alphabet and its position there becomes its index.
 * Most of internal edlib functions expect such transformed sequences.
 * Both output sequences are allocated here with malloc(), so make sure to free them when done.
 * Example:
 *   Original sequences: "ACT" and "CGT".
 *   Alphabet would be recognized as "ACTG". Alphabet length = 4.
 *   Transformed sequences: [0, 1, 2] and [1, 3, 2].
 * @param [in] queryOriginal
 * @param [in] queryLength
 * @param [in] targetOriginal
 * @param [in] targetLength
 * @param [out] queryTransformed  It will contain values in range [0, alphabet length - 1].
 * @param [out] targetTransformed  It will contain values in range [0, alphabet length - 1].
 * @return  Alphabet as a string of unique characters, where index of each character is its value in transformed
 *          sequences.
 */
static std::string transformSequences(const char* const queryOriginal, const int queryLength,
                                      const char* const targetOriginal, const int targetLength,
                                      unsigned char** const queryTransformed,
                                      unsigned char** const targetTransformed) {
    // Number of distinct values an unsigned char can hold; one table slot per value.
    const int numCharValues = static_cast<unsigned char>(-1) + 1;

    *queryTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * queryLength));
    *targetTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * targetLength));

    // ordinalOf[ch] is the alphabet index of ch; it is meaningful only once seen[ch] is true.
    unsigned char ordinalOf[numCharValues];
    bool seen[numCharValues];
    std::fill(seen, seen + numCharValues, false);

    std::string alphabet;

    // Run the same pass over the query first and the target second, sharing one alphabet.
    const char* const sources[2] = { queryOriginal, targetOriginal };
    const int lengths[2] = { queryLength, targetLength };
    unsigned char* const sinks[2] = { *queryTransformed, *targetTransformed };

    for (int s = 0; s < 2; s++) {
        const char* const src = sources[s];
        unsigned char* const dst = sinks[s];
        for (int pos = 0; pos < lengths[s]; pos++) {
            const unsigned char letter = static_cast<unsigned char>(src[pos]);
            if (!seen[letter]) {
                // New letter: it gets the next free ordinal and joins the alphabet.
                seen[letter] = true;
                ordinalOf[letter] = static_cast<unsigned char>(alphabet.size());
                alphabet.push_back(src[pos]);
            }
            dst[pos] = ordinalOf[letter];
        }
    }

    return alphabet;
}
1460
+
1461
+
1462
+ extern "C" EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task,
1463
+ const EdlibEqualityPair* additionalEqualities,
1464
+ int additionalEqualitiesLength) {
1465
+ EdlibAlignConfig config;
1466
+ config.k = k;
1467
+ config.mode = mode;
1468
+ config.task = task;
1469
+ config.additionalEqualities = additionalEqualities;
1470
+ config.additionalEqualitiesLength = additionalEqualitiesLength;
1471
+ return config;
1472
+ }
1473
+
1474
+ extern "C" EdlibAlignConfig edlibDefaultAlignConfig(void) {
1475
+ return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0);
1476
+ }
1477
+
1478
+ extern "C" void edlibFreeAlignResult(EdlibAlignResult result) {
1479
+ if (result.endLocations) free(result.endLocations);
1480
+ if (result.startLocations) free(result.startLocations);
1481
+ if (result.alignment) free(result.alignment);
1482
+ }