edlib 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1482 @@
1
+ #include "edlib.h"
2
+
3
+ #include <stdint.h>
4
+ #include <cstdlib>
5
+ #include <algorithm>
6
+ #include <vector>
7
+ #include <cstring>
8
+ #include <string>
9
+
10
using namespace std;

// Machine word that holds one block (WORD_SIZE cells) of a column.
typedef uint64_t Word;
// Number of bits in a Word.
static const int WORD_SIZE = static_cast<int>(sizeof(Word)) * 8;
// Word with only the lowest bit set: 00..01
static const Word WORD_1 = 1;
// Word with only the highest bit set: 100..00
static const Word HIGH_BIT_MASK = static_cast<Word>(1) << (WORD_SIZE - 1);
// Largest value an unsigned char can take; used to size lookup tables.
static const int MAX_UCHAR = 255;
17
+
18
+ // Data needed to find alignment.
19
+ struct AlignmentData {
20
+ Word* Ps;
21
+ Word* Ms;
22
+ int* scores;
23
+ int* firstBlocks;
24
+ int* lastBlocks;
25
+
26
+ AlignmentData(int maxNumBlocks, int targetLength) {
27
+ // We build a complete table and mark first and last block for each column
28
+ // (because algorithm is banded so only part of each columns is used).
29
+ // TODO: do not build a whole table, but just enough blocks for each column.
30
+ Ps = new Word[maxNumBlocks * targetLength];
31
+ Ms = new Word[maxNumBlocks * targetLength];
32
+ scores = new int[maxNumBlocks * targetLength];
33
+ firstBlocks = new int[targetLength];
34
+ lastBlocks = new int[targetLength];
35
+ }
36
+
37
+ ~AlignmentData() {
38
+ delete[] Ps;
39
+ delete[] Ms;
40
+ delete[] scores;
41
+ delete[] firstBlocks;
42
+ delete[] lastBlocks;
43
+ }
44
+ };
45
+
46
+ struct Block {
47
+ Word P; // Pvin
48
+ Word M; // Mvin
49
+ int score; // score of last cell in block;
50
+
51
+ Block() {}
52
+ Block(Word p, Word m, int s) :P(p), M(m), score(s) {}
53
+ };
54
+
55
+
56
+ /**
57
+ * Defines equality relation on alphabet characters.
58
+ * By default each character is always equal only to itself, but you can also provide additional equalities.
59
+ */
60
+ class EqualityDefinition {
61
+ private:
62
+ bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1];
63
+ public:
64
+ EqualityDefinition(const string& alphabet,
65
+ const EdlibEqualityPair* additionalEqualities = NULL,
66
+ const int additionalEqualitiesLength = 0) {
67
+ for (int i = 0; i < static_cast<int>(alphabet.size()); i++) {
68
+ for (int j = 0; j < static_cast<int>(alphabet.size()); j++) {
69
+ matrix[i][j] = (i == j);
70
+ }
71
+ }
72
+ if (additionalEqualities != NULL) {
73
+ for (int i = 0; i < additionalEqualitiesLength; i++) {
74
+ size_t firstTransformed = alphabet.find(additionalEqualities[i].first);
75
+ size_t secondTransformed = alphabet.find(additionalEqualities[i].second);
76
+ if (firstTransformed != string::npos && secondTransformed != string::npos) {
77
+ matrix[firstTransformed][secondTransformed] = matrix[secondTransformed][firstTransformed] = true;
78
+ }
79
+ }
80
+ }
81
+ }
82
+
83
+ /**
84
+ * @param a Element from transformed sequence.
85
+ * @param b Element from transformed sequence.
86
+ * @return True if a and b are defined as equal, false otherwise.
87
+ */
88
+ bool areEqual(unsigned char a, unsigned char b) const {
89
+ return matrix[a][b];
90
+ }
91
+ };
92
+
93
+ static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
94
+ int queryLength,
95
+ const unsigned char* target, int targetLength,
96
+ int k, EdlibAlignMode mode,
97
+ int* bestScore_, int** positions_, int* numPositions_);
98
+
99
+ static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks,
100
+ int queryLength,
101
+ const unsigned char* target, int targetLength,
102
+ int k, int* bestScore_,
103
+ int* position_, bool findAlignment,
104
+ AlignmentData** alignData, int targetStopPosition);
105
+
106
+
107
+ static int obtainAlignment(
108
+ const unsigned char* query, const unsigned char* rQuery, int queryLength,
109
+ const unsigned char* target, const unsigned char* rTarget, int targetLength,
110
+ const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
111
+ unsigned char** alignment, int* alignmentLength);
112
+
113
+ static int obtainAlignmentHirschberg(
114
+ const unsigned char* query, const unsigned char* rQuery, int queryLength,
115
+ const unsigned char* target, const unsigned char* rTarget, int targetLength,
116
+ const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
117
+ unsigned char** alignment, int* alignmentLength);
118
+
119
+ static int obtainAlignmentTraceback(int queryLength, int targetLength,
120
+ int bestScore, const AlignmentData* alignData,
121
+ unsigned char** alignment, int* alignmentLength);
122
+
123
+ static string transformSequences(const char* queryOriginal, int queryLength,
124
+ const char* targetOriginal, int targetLength,
125
+ unsigned char** queryTransformed,
126
+ unsigned char** targetTransformed);
127
+
128
+ static inline int ceilDiv(int x, int y);
129
+
130
+ static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
131
+
132
+ static inline Word* buildPeq(const int alphabetLength,
133
+ const unsigned char* query,
134
+ const int queryLength,
135
+ const EqualityDefinition& equalityDefinition);
136
+
137
+
138
+ /**
139
+ * Main edlib method.
140
+ */
141
+ extern "C" EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength,
142
+ const char* const targetOriginal, const int targetLength,
143
+ const EdlibAlignConfig config) {
144
+ EdlibAlignResult result;
145
+ result.status = EDLIB_STATUS_OK;
146
+ result.editDistance = -1;
147
+ result.endLocations = result.startLocations = NULL;
148
+ result.numLocations = 0;
149
+ result.alignment = NULL;
150
+ result.alignmentLength = 0;
151
+ result.alphabetLength = 0;
152
+
153
+ /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
154
+ unsigned char* query, * target;
155
+ string alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
156
+ &query, &target);
157
+ result.alphabetLength = static_cast<int>(alphabet.size());
158
+ /*-------------------------------------------------------*/
159
+
160
+ // Handle special situation when at least one of the sequences has length 0.
161
+ if (queryLength == 0 || targetLength == 0) {
162
+ if (config.mode == EDLIB_MODE_NW) {
163
+ result.editDistance = std::max(queryLength, targetLength);
164
+ result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
165
+ result.endLocations[0] = targetLength - 1;
166
+ result.numLocations = 1;
167
+ } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) {
168
+ result.editDistance = queryLength;
169
+ result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
170
+ result.endLocations[0] = -1;
171
+ result.numLocations = 1;
172
+ } else {
173
+ result.status = EDLIB_STATUS_ERROR;
174
+ }
175
+
176
+ free(query);
177
+ free(target);
178
+ return result;
179
+ }
180
+
181
+ /*--------------------- INITIALIZATION ------------------*/
182
+ int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
183
+ int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks
184
+ EqualityDefinition equalityDefinition(alphabet, config.additionalEqualities, config.additionalEqualitiesLength);
185
+ Word* Peq = buildPeq(static_cast<int>(alphabet.size()), query, queryLength, equalityDefinition);
186
+ /*-------------------------------------------------------*/
187
+
188
+ /*------------------ MAIN CALCULATION -------------------*/
189
+ // TODO: Store alignment data only after k is determined? That could make things faster.
190
+ int positionNW; // Used only when mode is NW.
191
+ AlignmentData* alignData = NULL;
192
+ bool dynamicK = false;
193
+ int k = config.k;
194
+ if (k < 0) { // If valid k is not given, auto-adjust k until solution is found.
195
+ dynamicK = true;
196
+ k = WORD_SIZE; // Gives better results than smaller k.
197
+ }
198
+
199
+ do {
200
+ if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) {
201
+ myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks,
202
+ queryLength, target, targetLength,
203
+ k, config.mode, &(result.editDistance),
204
+ &(result.endLocations), &(result.numLocations));
205
+ } else { // mode == EDLIB_MODE_NW
206
+ myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
207
+ queryLength, target, targetLength,
208
+ k, &(result.editDistance), &positionNW,
209
+ false, &alignData, -1);
210
+ }
211
+ k *= 2;
212
+ } while(dynamicK && result.editDistance == -1);
213
+
214
+ if (result.editDistance >= 0) { // If there is solution.
215
+ // If NW mode, set end location explicitly.
216
+ if (config.mode == EDLIB_MODE_NW) {
217
+ result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
218
+ result.endLocations[0] = targetLength - 1;
219
+ result.numLocations = 1;
220
+ }
221
+
222
+ // Find starting locations.
223
+ if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) {
224
+ result.startLocations = static_cast<int *>(malloc(result.numLocations * sizeof(int)));
225
+ if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations.
226
+ const unsigned char* rTarget = createReverseCopy(target, targetLength);
227
+ const unsigned char* rQuery = createReverseCopy(query, queryLength);
228
+ // Peq for reversed query.
229
+ Word* rPeq = buildPeq(static_cast<int>(alphabet.size()), rQuery, queryLength, equalityDefinition);
230
+ for (int i = 0; i < result.numLocations; i++) {
231
+ int endLocation = result.endLocations[i];
232
+ if (endLocation == -1) {
233
+ // NOTE: Sometimes one of optimal solutions is that query starts before target, like this:
234
+ // AAGG <- target
235
+ // CCTT <- query
236
+ // It will never be only optimal solution and it does not happen often, however it is
237
+ // possible and in that case end location will be -1. What should we do with that?
238
+ // Should we just skip reporting such end location, although it is a solution?
239
+ // If we do report it, what is the start location? -4? -1? Nothing?
240
+ // TODO: Figure this out. This has to do in general with how we think about start
241
+ // and end locations.
242
+ // Also, we have alignment later relying on this locations to limit the space of it's
243
+ // search -> how can it do it right if these locations are negative or incorrect?
244
+ result.startLocations[i] = 0; // I put 0 for now, but it does not make much sense.
245
+ } else {
246
+ int bestScoreSHW, numPositionsSHW;
247
+ int* positionsSHW;
248
+ myersCalcEditDistanceSemiGlobal(
249
+ rPeq, W, maxNumBlocks,
250
+ queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1,
251
+ result.editDistance, EDLIB_MODE_SHW,
252
+ &bestScoreSHW, &positionsSHW, &numPositionsSHW);
253
+ // Taking last location as start ensures that alignment will not start with insertions
254
+ // if it can start with mismatches instead.
255
+ result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1];
256
+ free(positionsSHW);
257
+ }
258
+ }
259
+ delete[] rTarget;
260
+ delete[] rQuery;
261
+ delete[] rPeq;
262
+ } else { // If mode is SHW or NW
263
+ for (int i = 0; i < result.numLocations; i++) {
264
+ result.startLocations[i] = 0;
265
+ }
266
+ }
267
+ }
268
+
269
+ // Find alignment -> all comes down to finding alignment for NW.
270
+ // Currently we return alignment only for first pair of locations.
271
+ if (config.task == EDLIB_TASK_PATH) {
272
+ int alnStartLocation = result.startLocations[0];
273
+ int alnEndLocation = result.endLocations[0];
274
+ const unsigned char* alnTarget = target + alnStartLocation;
275
+ const int alnTargetLength = alnEndLocation - alnStartLocation + 1;
276
+ const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength);
277
+ const unsigned char* rQuery = createReverseCopy(query, queryLength);
278
+ obtainAlignment(query, rQuery, queryLength,
279
+ alnTarget, rAlnTarget, alnTargetLength,
280
+ equalityDefinition, static_cast<int>(alphabet.size()), result.editDistance,
281
+ &(result.alignment), &(result.alignmentLength));
282
+ delete[] rAlnTarget;
283
+ delete[] rQuery;
284
+ }
285
+ }
286
+ /*-------------------------------------------------------*/
287
+
288
+ //--- Free memory ---//
289
+ delete[] Peq;
290
+ free(query);
291
+ free(target);
292
+ if (alignData) delete alignData;
293
+ //-------------------//
294
+
295
+ return result;
296
+ }
297
+
298
+ extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength,
299
+ const EdlibCigarFormat cigarFormat) {
300
+ if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) {
301
+ return 0;
302
+ }
303
+
304
+ // Maps move code from alignment to char in cigar.
305
+ // 0 1 2 3
306
+ char moveCodeToChar[] = {'=', 'I', 'D', 'X'};
307
+ if (cigarFormat == EDLIB_CIGAR_STANDARD) {
308
+ moveCodeToChar[0] = moveCodeToChar[3] = 'M';
309
+ }
310
+
311
+ vector<char>* cigar = new vector<char>();
312
+ char lastMove = 0; // Char of last move. 0 if there was no previous move.
313
+ int numOfSameMoves = 0;
314
+ for (int i = 0; i <= alignmentLength; i++) {
315
+ // if new sequence of same moves started
316
+ if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) {
317
+ // Write number of moves to cigar string.
318
+ int numDigits = 0;
319
+ for (; numOfSameMoves; numOfSameMoves /= 10) {
320
+ cigar->push_back('0' + numOfSameMoves % 10);
321
+ numDigits++;
322
+ }
323
+ reverse(cigar->end() - numDigits, cigar->end());
324
+ // Write code of move to cigar string.
325
+ cigar->push_back(lastMove);
326
+ // If not at the end, start new sequence of moves.
327
+ if (i < alignmentLength) {
328
+ // Check if alignment has valid values.
329
+ if (alignment[i] > 3) {
330
+ delete cigar;
331
+ return 0;
332
+ }
333
+ numOfSameMoves = 0;
334
+ }
335
+ }
336
+ if (i < alignmentLength) {
337
+ lastMove = moveCodeToChar[alignment[i]];
338
+ numOfSameMoves++;
339
+ }
340
+ }
341
+ cigar->push_back(0); // Null character termination.
342
+ char* cigar_ = static_cast<char *>(malloc(cigar->size() * sizeof(char)));
343
+ memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char));
344
+ delete cigar;
345
+
346
+ return cigar_;
347
+ }
348
+
349
+ /**
350
+ * Build Peq table for given query and alphabet.
351
+ * Peq is table of dimensions alphabetLength+1 x maxNumBlocks.
352
+ * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
353
+ * NOTICE: free returned array with delete[]!
354
+ */
355
+ static inline Word* buildPeq(const int alphabetLength,
356
+ const unsigned char* const query,
357
+ const int queryLength,
358
+ const EqualityDefinition& equalityDefinition) {
359
+ int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
360
+ // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
361
+ Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];
362
+
363
+ // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s
364
+ for (int symbol = 0; symbol <= alphabetLength; symbol++) {
365
+ for (int b = 0; b < maxNumBlocks; b++) {
366
+ if (symbol < alphabetLength) {
367
+ Peq[symbol * maxNumBlocks + b] = 0;
368
+ for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) {
369
+ Peq[symbol * maxNumBlocks + b] <<= 1;
370
+ // NOTE: We pretend like query is padded at the end with W wildcard symbols
371
+ if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol))
372
+ Peq[symbol * maxNumBlocks + b] += 1;
373
+ }
374
+ } else { // Last symbol is wildcard, so it is all 1s
375
+ Peq[symbol * maxNumBlocks + b] = static_cast<Word>(-1);
376
+ }
377
+ }
378
+ }
379
+
380
+ return Peq;
381
+ }
382
+
383
+
384
/**
 * Allocates a new array holding the elements of seq in reverse order.
 * @param seq     Sequence to reverse; it is not modified.
 * @param length  Number of elements in seq.
 * @return Newly allocated reversed copy. Free returned array with delete[].
 */
static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) {
    unsigned char* const reversed = new unsigned char[length];
    std::reverse_copy(seq, seq + length, reversed);
    return reversed;
}
395
+
396
+ /**
397
+ * Corresponds to Advance_Block function from Myers.
398
+ * Calculates one word(block), which is part of a column.
399
+ * Highest bit of word (one most to the left) is most bottom cell of block from column.
400
+ * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1].
401
+ * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0.
402
+ * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0.
403
+ * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch.
404
+ * @param [in] hin Will be +1, 0 or -1.
405
+ * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
406
+ * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
407
+ * @param [out] hout Will be +1, 0 or -1.
408
+ */
409
+ static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
410
+ Word &PvOut, Word &MvOut) {
411
+ // hin can be 1, -1 or 0.
412
+ // 1 -> 00...01
413
+ // 0 -> 00...00
414
+ // -1 -> 11...11 (2-complement)
415
+
416
+ Word hinIsNeg = static_cast<Word>(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1
417
+
418
+ Word Xv = Eq | Mv;
419
+ // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
420
+ Eq |= hinIsNeg;
421
+ Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
422
+
423
+ Word Ph = Mv | ~(Xh | Pv);
424
+ Word Mh = Pv & Xh;
425
+
426
+ int hout = 0;
427
+ // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
428
+ hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
429
+ // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
430
+ hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
431
+
432
+ Ph <<= 1;
433
+ Mh <<= 1;
434
+
435
+ // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
436
+ Mh |= hinIsNeg;
437
+ // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
438
+ Ph |= static_cast<Word>((hin + 1) >> 1);
439
+
440
+ PvOut = Mh | ~(Xv | Ph);
441
+ MvOut = Ph & Xv;
442
+
443
+ return hout;
444
+ }
445
+
446
/**
 * Ceiling division: smallest integer not less than x / y.
 * Note: x must be non-negative and y must be positive.
 */
static inline int ceilDiv(const int x, const int y) {
    const int quotient = x / y;
    if (x % y != 0) {
        return quotient + 1;
    }
    return quotient;
}
453
+
454
// Returns the smaller of two ints (y when they are equal).
static inline int min(const int x, const int y) {
    if (x < y) {
        return x;
    }
    return y;
}
457
+
458
// Returns the larger of two ints (y when they are equal).
static inline int max(const int x, const int y) {
    if (x > y) {
        return x;
    }
    return y;
}
461
+
462
+
463
+ /**
464
+ * @param [in] block
465
+ * @return Values of cells in block, starting with bottom cell in block.
466
+ */
467
+ static inline vector<int> getBlockCellValues(const Block block) {
468
+ vector<int> scores(WORD_SIZE);
469
+ int score = block.score;
470
+ Word mask = HIGH_BIT_MASK;
471
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
472
+ scores[i] = score;
473
+ if (block.P & mask) score--;
474
+ if (block.M & mask) score++;
475
+ mask >>= 1;
476
+ }
477
+ scores[WORD_SIZE - 1] = score;
478
+ return scores;
479
+ }
480
+
481
+ /**
482
+ * Writes values of cells in block into given array, starting with first/top cell.
483
+ * @param [in] block
484
+ * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
485
+ */
486
+ static inline void readBlock(const Block block, int* const dest) {
487
+ int score = block.score;
488
+ Word mask = HIGH_BIT_MASK;
489
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
490
+ dest[WORD_SIZE - 1 - i] = score;
491
+ if (block.P & mask) score--;
492
+ if (block.M & mask) score++;
493
+ mask >>= 1;
494
+ }
495
+ dest[0] = score;
496
+ }
497
+
498
+ /**
499
+ * Writes values of cells in block into given array, starting with last/bottom cell.
500
+ * @param [in] block
501
+ * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
502
+ */
503
+ static inline void readBlockReverse(const Block block, int* const dest) {
504
+ int score = block.score;
505
+ Word mask = HIGH_BIT_MASK;
506
+ for (int i = 0; i < WORD_SIZE - 1; i++) {
507
+ dest[i] = score;
508
+ if (block.P & mask) score--;
509
+ if (block.M & mask) score++;
510
+ mask >>= 1;
511
+ }
512
+ dest[WORD_SIZE - 1] = score;
513
+ }
514
+
515
+ /**
516
+ * @param [in] block
517
+ * @param [in] k
518
+ * @return True if all cells in block have value larger than k, otherwise false.
519
+ */
520
+ static inline bool allBlockCellsLarger(const Block block, const int k) {
521
+ vector<int> scores = getBlockCellValues(block);
522
+ for (int i = 0; i < WORD_SIZE; i++) {
523
+ if (scores[i] <= k) return false;
524
+ }
525
+ return true;
526
+ }
527
+
528
+
529
+ /**
530
+ * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods.
531
+ * @param [in] Peq Query profile.
532
+ * @param [in] W Size of padding in last block.
533
+ * TODO: Calculate this directly from query, instead of passing it.
534
+ * @param [in] maxNumBlocks Number of blocks needed to cover the whole query.
535
+ * TODO: Calculate this directly from query, instead of passing it.
536
+ * @param [in] queryLength
537
+ * @param [in] target
538
+ * @param [in] targetLength
539
+ * @param [in] k
540
+ * @param [in] mode EDLIB_MODE_HW or EDLIB_MODE_SHW
541
+ * @param [out] bestScore_ Edit distance.
542
+ * @param [out] positions_ Array of 0-indexed positions in target at which best score was found.
543
+ Make sure to free this array with free().
544
+ * @param [out] numPositions_ Number of positions in the positions_ array.
545
+ * @return Status.
546
+ */
547
+ static int myersCalcEditDistanceSemiGlobal(
548
+ const Word* const Peq, const int W, const int maxNumBlocks,
549
+ const int queryLength,
550
+ const unsigned char* const target, const int targetLength,
551
+ int k, const EdlibAlignMode mode,
552
+ int* const bestScore_, int** const positions_, int* const numPositions_) {
553
+ *positions_ = NULL;
554
+ *numPositions_ = 0;
555
+
556
+ // firstBlock is 0-based index of first block in Ukkonen band.
557
+ // lastBlock is 0-based index of last block in Ukkonen band.
558
+ int firstBlock = 0;
559
+ int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
560
+ Block *bl; // Current block
561
+
562
+ Block* blocks = new Block[maxNumBlocks];
563
+
564
+ // For HW, solution will never be larger then queryLength.
565
+ if (mode == EDLIB_MODE_HW) {
566
+ k = min(queryLength, k);
567
+ }
568
+
569
+ // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
570
+ // This gives speed up of about 2 times for small k.
571
+ const int STRONG_REDUCE_NUM = 2048;
572
+
573
+ // Initialize P, M and score
574
+ bl = blocks;
575
+ for (int b = 0; b <= lastBlock; b++) {
576
+ bl->score = (b + 1) * WORD_SIZE;
577
+ bl->P = static_cast<Word>(-1); // All 1s
578
+ bl->M = static_cast<Word>(0);
579
+ bl++;
580
+ }
581
+
582
+ int bestScore = -1;
583
+ vector<int> positions; // TODO: Maybe put this on heap?
584
+ const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
585
+ const unsigned char* targetChar = target;
586
+ for (int c = 0; c < targetLength; c++) { // for each column
587
+ const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;
588
+
589
+ //----------------------- Calculate column -------------------------//
590
+ int hout = startHout;
591
+ bl = blocks + firstBlock;
592
+ Peq_c += firstBlock;
593
+ for (int b = firstBlock; b <= lastBlock; b++) {
594
+ hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M);
595
+ bl->score += hout;
596
+ bl++; Peq_c++;
597
+ }
598
+ bl--; Peq_c--;
599
+ //------------------------------------------------------------------//
600
+
601
+ //---------- Adjust number of blocks according to Ukkonen ----------//
602
+ if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block
603
+ && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
604
+ // If score of left block is not too big, calculate one more block
605
+ lastBlock++; bl++; Peq_c++;
606
+ bl->P = static_cast<Word>(-1); // All 1s
607
+ bl->M = static_cast<Word>(0);
608
+ bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M);
609
+ } else {
610
+ while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
611
+ lastBlock--; bl--; Peq_c--;
612
+ }
613
+ }
614
+
615
+ // Every some columns, do some expensive but also more efficient block reducing.
616
+ // This is important!
617
+ //
618
+ // Reduce the band by decreasing last block if possible.
619
+ if (c % STRONG_REDUCE_NUM == 0) {
620
+ while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
621
+ lastBlock--; bl--; Peq_c--;
622
+ }
623
+ }
624
+ // For HW, even if all cells are > k, there still may be solution in next
625
+ // column because starting conditions at upper boundary are 0.
626
+ // That means that first block is always candidate for solution,
627
+ // and we can never end calculation before last column.
628
+ if (mode == EDLIB_MODE_HW && lastBlock == -1) {
629
+ lastBlock++; bl++; Peq_c++;
630
+ }
631
+
632
+ // Reduce band by increasing first block if possible. Not applicable to HW.
633
+ if (mode != EDLIB_MODE_HW) {
634
+ while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
635
+ firstBlock++;
636
+ }
637
+ if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks
638
+ while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
639
+ firstBlock++;
640
+ }
641
+ }
642
+ }
643
+
644
+ // If band stops to exist finish
645
+ if (lastBlock < firstBlock) {
646
+ *bestScore_ = bestScore;
647
+ if (bestScore != -1) {
648
+ *positions_ = static_cast<int *>(malloc(sizeof(int) * static_cast<int>(positions.size())));
649
+ *numPositions_ = static_cast<int>(positions.size());
650
+ copy(positions.begin(), positions.end(), *positions_);
651
+ }
652
+ delete[] blocks;
653
+ return EDLIB_STATUS_OK;
654
+ }
655
+ //------------------------------------------------------------------//
656
+
657
+ //------------------------- Update best score ----------------------//
658
+ if (lastBlock == maxNumBlocks - 1) {
659
+ int colScore = bl->score;
660
+ if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k.
661
+ // NOTE: Score that I find in column c is actually score from column c-W
662
+ if (bestScore == -1 || colScore <= bestScore) {
663
+ if (colScore != bestScore) {
664
+ positions.clear();
665
+ bestScore = colScore;
666
+ // Change k so we will look only for equal or better
667
+ // scores then the best found so far.
668
+ k = bestScore;
669
+ }
670
+ positions.push_back(c - W);
671
+ }
672
+ }
673
+ }
674
+ //------------------------------------------------------------------//
675
+
676
+ targetChar++;
677
+ }
678
+
679
+
680
+ // Obtain results for last W columns from last column.
681
+ if (lastBlock == maxNumBlocks - 1) {
682
+ vector<int> blockScores = getBlockCellValues(*bl);
683
+ for (int i = 0; i < W; i++) {
684
+ int colScore = blockScores[i + 1];
685
+ if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
686
+ if (colScore != bestScore) {
687
+ positions.clear();
688
+ k = bestScore = colScore;
689
+ }
690
+ positions.push_back(targetLength - W + i);
691
+ }
692
+ }
693
+ }
694
+
695
+ *bestScore_ = bestScore;
696
+ if (bestScore != -1) {
697
+ *positions_ = static_cast<int *>(malloc(sizeof(int) * static_cast<int>(positions.size())));
698
+ *numPositions_ = static_cast<int>(positions.size());
699
+ copy(positions.begin(), positions.end(), *positions_);
700
+ }
701
+
702
+ delete[] blocks;
703
+ return EDLIB_STATUS_OK;
704
+ }
705
+
706
+
707
+ /**
708
+ * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method.
709
+ * @param [in] Peq Query profile.
710
+ * @param [in] W Size of padding in last block.
711
+ * TODO: Calculate this directly from query, instead of passing it.
712
+ * @param [in] maxNumBlocks Number of blocks needed to cover the whole query.
713
+ * TODO: Calculate this directly from query, instead of passing it.
714
+ * @param [in] queryLength
715
+ * @param [in] target
716
+ * @param [in] targetLength
717
+ * @param [in] k
718
+ * @param [out] bestScore_ Edit distance.
719
+ * @param [out] position_ 0-indexed position in target at which best score was found.
720
+ * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned.
721
+ * Quadratic amount of memory is consumed.
722
+ * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment).
723
+ * Set only if findAlignment is set to true, otherwise it is NULL.
724
+ * Make sure to free this array using delete[].
725
+ * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected.
726
+ * If set to p, calculation is performed up to position p in target (inclusive)
727
+ * and column p is returned as the only column in alignData.
728
+ * @return Status.
729
+ */
730
+ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks,
731
+ const int queryLength,
732
+ const unsigned char* const target, const int targetLength,
733
+ int k, int* const bestScore_,
734
+ int* const position_, const bool findAlignment,
735
+ AlignmentData** const alignData, const int targetStopPosition) {
736
+ if (targetStopPosition > -1 && findAlignment) {
737
+ // They can not be both set at the same time!
738
+ return EDLIB_STATUS_ERROR;
739
+ }
740
+
741
+ // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
742
+ const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation
743
+
744
+ if (k < abs(targetLength - queryLength)) {
745
+ *bestScore_ = *position_ = -1;
746
+ return EDLIB_STATUS_OK;
747
+ }
748
+
749
+ k = min(k, max(queryLength, targetLength)); // Upper bound for k
750
+
751
+ // firstBlock is 0-based index of first block in Ukkonen band.
752
+ // lastBlock is 0-based index of last block in Ukkonen band.
753
+ int firstBlock = 0;
754
+ // This is optimal now, by my formula.
755
+ int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1;
756
+ Block* bl; // Current block
757
+
758
+ Block* blocks = new Block[maxNumBlocks];
759
+
760
+ // Initialize P, M and score
761
+ bl = blocks;
762
+ for (int b = 0; b <= lastBlock; b++) {
763
+ bl->score = (b + 1) * WORD_SIZE;
764
+ bl->P = static_cast<Word>(-1); // All 1s
765
+ bl->M = static_cast<Word>(0);
766
+ bl++;
767
+ }
768
+
769
+ // If we want to find alignment, we have to store needed data.
770
+ if (findAlignment)
771
+ *alignData = new AlignmentData(maxNumBlocks, targetLength);
772
+ else if (targetStopPosition > -1)
773
+ *alignData = new AlignmentData(maxNumBlocks, 1);
774
+ else
775
+ *alignData = NULL;
776
+
777
+ const unsigned char* targetChar = target;
778
+ for (int c = 0; c < targetLength; c++) { // for each column
779
+ const Word* Peq_c = Peq + *targetChar * maxNumBlocks;
780
+
781
+ //----------------------- Calculate column -------------------------//
782
+ int hout = 1;
783
+ bl = blocks + firstBlock;
784
+ for (int b = firstBlock; b <= lastBlock; b++) {
785
+ hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M);
786
+ bl->score += hout;
787
+ bl++;
788
+ }
789
+ bl--;
790
+ //------------------------------------------------------------------//
791
+ // bl now points to last block
792
+
793
+ // Update k. I do it only on end of column because it would slow calculation too much otherwise.
794
+ // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up.
795
+ k = min(k, bl->score
796
+ + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1)
797
+ + (lastBlock == maxNumBlocks - 1 ? W : 0));
798
+
799
+ //---------- Adjust number of blocks according to Ukkonen ----------//
800
+ //--- Adjust last block ---//
801
+ // If block is not beneath band, calculate next block. Only next because others are certainly beneath band.
802
+ if (lastBlock + 1 < maxNumBlocks
803
+ && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also!
804
+ ((lastBlock + 1) * WORD_SIZE - 1
805
+ > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) {
806
+ lastBlock++; bl++;
807
+ bl->P = static_cast<Word>(-1); // All 1s
808
+ bl->M = static_cast<Word>(0);
809
+ int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M);
810
+ bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout;
811
+ hout = newHout;
812
+ }
813
+
814
+ // While block is out of band, move one block up.
815
+ // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it.
816
+ // I could consider adding that max part, for optimal performance.
817
+ while (lastBlock >= firstBlock
818
+ && (bl->score >= k + WORD_SIZE
819
+ || ((lastBlock + 1) * WORD_SIZE - 1 >
820
+ // TODO: Does not work if do not put +1! Why???
821
+ k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) {
822
+ lastBlock--; bl--;
823
+ }
824
+ //-------------------------//
825
+
826
+ //--- Adjust first block ---//
827
+ // While outside of band, advance block
828
+ while (firstBlock <= lastBlock
829
+ && (blocks[firstBlock].score >= k + WORD_SIZE
830
+ || ((firstBlock + 1) * WORD_SIZE - 1 <
831
+ blocks[firstBlock].score - k - targetLength + queryLength + c))) {
832
+ firstBlock++;
833
+ }
834
+ //--------------------------/
835
+
836
+
837
+ // TODO: consider if this part is useful, it does not seem to help much
838
+ if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction
839
+ while (lastBlock >= firstBlock) {
840
+ // If all cells outside of band, remove block
841
+ vector<int> scores = getBlockCellValues(*bl);
842
+ int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
843
+ int r = lastBlock * WORD_SIZE + numCells - 1;
844
+ bool reduce = true;
845
+ for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
846
+ // TODO: Does not work if do not put +1! Why???
847
+ if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) {
848
+ reduce = false;
849
+ break;
850
+ }
851
+ r--;
852
+ }
853
+ if (!reduce) break;
854
+ lastBlock--; bl--;
855
+ }
856
+
857
+ while (firstBlock <= lastBlock) {
858
+ // If all cells outside of band, remove block
859
+ vector<int> scores = getBlockCellValues(blocks[firstBlock]);
860
+ int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
861
+ int r = firstBlock * WORD_SIZE + numCells - 1;
862
+ bool reduce = true;
863
+ for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
864
+ if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) {
865
+ reduce = false;
866
+ break;
867
+ }
868
+ r--;
869
+ }
870
+ if (!reduce) break;
871
+ firstBlock++;
872
+ }
873
+ }
874
+
875
+
876
+ // If band stops to exist finish
877
+ if (lastBlock < firstBlock) {
878
+ *bestScore_ = *position_ = -1;
879
+ delete[] blocks;
880
+ return EDLIB_STATUS_OK;
881
+ }
882
+ //------------------------------------------------------------------//
883
+
884
+
885
+ //---- Save column so it can be used for reconstruction ----//
886
+ if (findAlignment && c < targetLength) {
887
+ bl = blocks + firstBlock;
888
+ for (int b = firstBlock; b <= lastBlock; b++) {
889
+ (*alignData)->Ps[maxNumBlocks * c + b] = bl->P;
890
+ (*alignData)->Ms[maxNumBlocks * c + b] = bl->M;
891
+ (*alignData)->scores[maxNumBlocks * c + b] = bl->score;
892
+ (*alignData)->firstBlocks[c] = firstBlock;
893
+ (*alignData)->lastBlocks[c] = lastBlock;
894
+ bl++;
895
+ }
896
+ }
897
+ //----------------------------------------------------------//
898
+ //---- If this is stop column, save it and finish ----//
899
+ if (c == targetStopPosition) {
900
+ for (int b = firstBlock; b <= lastBlock; b++) {
901
+ (*alignData)->Ps[b] = (blocks + b)->P;
902
+ (*alignData)->Ms[b] = (blocks + b)->M;
903
+ (*alignData)->scores[b] = (blocks + b)->score;
904
+ (*alignData)->firstBlocks[0] = firstBlock;
905
+ (*alignData)->lastBlocks[0] = lastBlock;
906
+ }
907
+ *bestScore_ = -1;
908
+ *position_ = targetStopPosition;
909
+ delete[] blocks;
910
+ return EDLIB_STATUS_OK;
911
+ }
912
+ //----------------------------------------------------//
913
+
914
+ targetChar++;
915
+ }
916
+
917
+ if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated
918
+ // Obtain best score from block -> it is complicated because query is padded with W cells
919
+ int bestScore = getBlockCellValues(blocks[lastBlock])[W];
920
+ if (bestScore <= k) {
921
+ *bestScore_ = bestScore;
922
+ *position_ = targetLength - 1;
923
+ delete[] blocks;
924
+ return EDLIB_STATUS_OK;
925
+ }
926
+ }
927
+
928
+ *bestScore_ = *position_ = -1;
929
+ delete[] blocks;
930
+ return EDLIB_STATUS_OK;
931
+ }
932
+
933
+
934
+ /**
935
+ * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix,
936
+ * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength).
937
+ * @param [in] queryLength Normal length, without W.
938
+ * @param [in] targetLength Normal length, without W.
939
+ * @param [in] bestScore Best score.
940
+ * @param [in] alignData Data obtained during finding best score that is useful for finding alignment.
941
+ * @param [out] alignment Alignment.
942
+ * @param [out] alignmentLength Length of alignment.
943
+ * @return Status code.
944
+ */
945
+ static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
946
+ const int bestScore, const AlignmentData* const alignData,
947
+ unsigned char** const alignment, int* const alignmentLength) {
948
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
949
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
950
+
951
+ *alignment = static_cast<unsigned char*>(malloc((queryLength + targetLength - 1) * sizeof(unsigned char)));
952
+ *alignmentLength = 0;
953
+ int c = targetLength - 1; // index of column
954
+ int b = maxNumBlocks - 1; // index of block in column
955
+ int currScore = bestScore; // Score of current cell
956
+ int lScore = -1; // Score of left cell
957
+ int uScore = -1; // Score of upper cell
958
+ int ulScore = -1; // Score of upper left cell
959
+ Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block
960
+ Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block
961
+ // True if block to left exists and is in band
962
+ bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1];
963
+ // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the
964
+ // calculation as both lP and lM should be initialized at some moment later (but compiler can not
965
+ // detect it since this initialization is guaranteed by "business" logic).
966
+ Word lP = 0, lM = 0;
967
+ if (thereIsLeftBlock) {
968
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left
969
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left
970
+ }
971
+ currP <<= W;
972
+ currM <<= W;
973
+ int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos
974
+
975
+ // TODO(martin): refactor this whole piece of code. There are too many if-else statements,
976
+ // it is too easy for a bug to hide and to hard to effectively cover all the edge-cases.
977
+ // We need better separation of logic and responsibilities.
978
+ while (true) {
979
+ if (c == 0) {
980
+ thereIsLeftBlock = true;
981
+ lScore = b * WORD_SIZE + blockPos + 1;
982
+ ulScore = lScore - 1;
983
+ }
984
+
985
+ // TODO: improvement: calculate only those cells that are needed,
986
+ // for example if I calculate upper cell and can move up,
987
+ // there is no need to calculate left and upper left cell
988
+ //---------- Calculate scores ---------//
989
+ if (lScore == -1 && thereIsLeftBlock) {
990
+ lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left
991
+ for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) {
992
+ if (lP & HIGH_BIT_MASK) lScore--;
993
+ if (lM & HIGH_BIT_MASK) lScore++;
994
+ lP <<= 1;
995
+ lM <<= 1;
996
+ }
997
+ }
998
+ if (ulScore == -1) {
999
+ if (lScore != -1) {
1000
+ ulScore = lScore;
1001
+ if (lP & HIGH_BIT_MASK) ulScore--;
1002
+ if (lM & HIGH_BIT_MASK) ulScore++;
1003
+ }
1004
+ else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) {
1005
+ // This is the case when upper left cell is last cell in block,
1006
+ // and block to left is not in band so lScore is -1.
1007
+ ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1];
1008
+ }
1009
+ }
1010
+ if (uScore == -1) {
1011
+ uScore = currScore;
1012
+ if (currP & HIGH_BIT_MASK) uScore--;
1013
+ if (currM & HIGH_BIT_MASK) uScore++;
1014
+ currP <<= 1;
1015
+ currM <<= 1;
1016
+ }
1017
+ //-------------------------------------//
1018
+
1019
+ // TODO: should I check if there is upper block?
1020
+
1021
+ //-------------- Move --------------//
1022
+ // Move up - insertion to target - deletion from query
1023
+ if (uScore != -1 && uScore + 1 == currScore) {
1024
+ currScore = uScore;
1025
+ lScore = ulScore;
1026
+ uScore = ulScore = -1;
1027
+ if (blockPos == 0) { // If entering new (upper) block
1028
+ if (b == 0) { // If there are no cells above (only boundary cells)
1029
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up
1030
+ for (int i = 0; i < c + 1; i++) // Move left until end
1031
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
1032
+ break;
1033
+ } else {
1034
+ blockPos = WORD_SIZE - 1;
1035
+ b--;
1036
+ currP = alignData->Ps[c * maxNumBlocks + b];
1037
+ currM = alignData->Ms[c * maxNumBlocks + b];
1038
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
1039
+ thereIsLeftBlock = true;
1040
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations
1041
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
1042
+ } else {
1043
+ thereIsLeftBlock = false;
1044
+ // TODO(martin): There may not be left block, but there can be left boundary - do we
1045
+ // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this.
1046
+ }
1047
+ }
1048
+ } else {
1049
+ blockPos--;
1050
+ lP <<= 1;
1051
+ lM <<= 1;
1052
+ }
1053
+ // Mark move
1054
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
1055
+ }
1056
+ // Move left - deletion from target - insertion to query
1057
+ else if (lScore != -1 && lScore + 1 == currScore) {
1058
+ currScore = lScore;
1059
+ uScore = ulScore;
1060
+ lScore = ulScore = -1;
1061
+ c--;
1062
+ if (c == -1) { // If there are no cells to the left (only boundary cells)
1063
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left
1064
+ int numUp = b * WORD_SIZE + blockPos + 1;
1065
+ for (int i = 0; i < numUp; i++) // Move up until end
1066
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
1067
+ break;
1068
+ }
1069
+ currP = lP;
1070
+ currM = lM;
1071
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
1072
+ thereIsLeftBlock = true;
1073
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
1074
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
1075
+ } else {
1076
+ if (c == 0) { // If there are no cells to the left (only boundary cells)
1077
+ thereIsLeftBlock = true;
1078
+ lScore = b * WORD_SIZE + blockPos + 1;
1079
+ ulScore = lScore - 1;
1080
+ } else {
1081
+ thereIsLeftBlock = false;
1082
+ }
1083
+ }
1084
+ // Mark move
1085
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
1086
+ }
1087
+ // Move up left - (mis)match
1088
+ else if (ulScore != -1) {
1089
+ unsigned char moveCode = ulScore == currScore ? EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH;
1090
+ currScore = ulScore;
1091
+ uScore = lScore = ulScore = -1;
1092
+ c--;
1093
+ if (c == -1) { // If there are no cells to the left (only boundary cells)
1094
+ (*alignment)[(*alignmentLength)++] = moveCode; // Move left
1095
+ int numUp = b * WORD_SIZE + blockPos;
1096
+ for (int i = 0; i < numUp; i++) // Move up until end
1097
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
1098
+ break;
1099
+ }
1100
+ if (blockPos == 0) { // If entering upper left block
1101
+ if (b == 0) { // If there are no more cells above (only boundary cells)
1102
+ (*alignment)[(*alignmentLength)++] = moveCode; // Move up left
1103
+ for (int i = 0; i < c + 1; i++) // Move left until end
1104
+ (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
1105
+ break;
1106
+ }
1107
+ blockPos = WORD_SIZE - 1;
1108
+ b--;
1109
+ currP = alignData->Ps[c * maxNumBlocks + b];
1110
+ currM = alignData->Ms[c * maxNumBlocks + b];
1111
+ } else { // If entering left block
1112
+ blockPos--;
1113
+ currP = lP;
1114
+ currM = lM;
1115
+ currP <<= 1;
1116
+ currM <<= 1;
1117
+ }
1118
+ // Set new left block
1119
+ if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
1120
+ thereIsLeftBlock = true;
1121
+ lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
1122
+ lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
1123
+ } else {
1124
+ if (c == 0) { // If there are no cells to the left (only boundary cells)
1125
+ thereIsLeftBlock = true;
1126
+ lScore = b * WORD_SIZE + blockPos + 1;
1127
+ ulScore = lScore - 1;
1128
+ } else {
1129
+ thereIsLeftBlock = false;
1130
+ }
1131
+ }
1132
+ // Mark move
1133
+ (*alignment)[(*alignmentLength)++] = moveCode;
1134
+ } else {
1135
+ // Reached end - finished!
1136
+ break;
1137
+ }
1138
+ //----------------------------------//
1139
+ }
1140
+
1141
+ *alignment = static_cast<unsigned char*>(realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)));
1142
+ reverse(*alignment, *alignment + (*alignmentLength));
1143
+ return EDLIB_STATUS_OK;
1144
+ }
1145
+
1146
+
1147
+ /**
1148
+ * Finds one possible alignment that gives optimal score (bestScore).
1149
+ * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough,
1150
+ * it will solve them using traceback algorithm.
1151
+ * @param [in] query
1152
+ * @param [in] rQuery Reversed query.
1153
+ * @param [in] queryLength
1154
+ * @param [in] target
1155
+ * @param [in] rTarget Reversed target.
1156
+ * @param [in] targetLength
1157
+ * @param [in] equalityDefinition
1158
+ * @param [in] alphabetLength
1159
+ * @param [in] bestScore Best(optimal) score.
1160
+ * @param [out] alignment Sequence of edit operations that make target equal to query.
1161
+ * @param [out] alignmentLength Length of alignment.
1162
+ * @return Status code.
1163
+ */
1164
+ static int obtainAlignment(
1165
+ const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
1166
+ const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
1167
+ const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
1168
+ unsigned char** const alignment, int* const alignmentLength) {
1169
+
1170
+ // Handle special case when one of sequences has length of 0.
1171
+ if (queryLength == 0 || targetLength == 0) {
1172
+ *alignmentLength = targetLength + queryLength;
1173
+ *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
1174
+ for (int i = 0; i < *alignmentLength; i++) {
1175
+ (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT;
1176
+ }
1177
+ return EDLIB_STATUS_OK;
1178
+ }
1179
+
1180
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
1181
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
1182
+ int statusCode;
1183
+
1184
+ // TODO: think about reducing number of memory allocations in alignment functions, probably
1185
+ // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg,
1186
+ // and it could also be done for alignments - we could have one big array for alignment that would be
1187
+ // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results.
1188
+
1189
+ // If estimated memory consumption for traceback algorithm is smaller than 1MB use it,
1190
+ // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal.
1191
+ long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength
1192
+ + 2ll * sizeof(int) * targetLength;
1193
+ if (alignmentDataSize < 1024 * 1024) {
1194
+ int score_, endLocation_; // Used only to call function.
1195
+ AlignmentData* alignData = NULL;
1196
+ Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
1197
+ myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
1198
+ queryLength,
1199
+ target, targetLength,
1200
+ bestScore,
1201
+ &score_, &endLocation_, true, &alignData, -1);
1202
+ //assert(score_ == bestScore);
1203
+ //assert(endLocation_ == targetLength - 1);
1204
+
1205
+ statusCode = obtainAlignmentTraceback(queryLength, targetLength,
1206
+ bestScore, alignData, alignment, alignmentLength);
1207
+ delete alignData;
1208
+ delete[] Peq;
1209
+ } else {
1210
+ statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength,
1211
+ target, rTarget, targetLength,
1212
+ equalityDefinition, alphabetLength, bestScore,
1213
+ alignment, alignmentLength);
1214
+ }
1215
+ return statusCode;
1216
+ }
1217
+
1218
+
1219
+ /**
1220
+ * Finds one possible alignment that gives optimal score (bestScore).
1221
+ * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together.
1222
+ * @param [in] query
1223
+ * @param [in] rQuery Reversed query.
1224
+ * @param [in] queryLength
1225
+ * @param [in] target
1226
+ * @param [in] rTarget Reversed target.
1227
+ * @param [in] targetLength
1228
+ * @param [in] alphabetLength
1229
+ * @param [in] bestScore Best(optimal) score.
1230
+ * @param [out] alignment Sequence of edit operations that make target equal to query.
1231
+ * @param [out] alignmentLength Length of alignment.
1232
+ * @return Status code.
1233
+ */
1234
+ static int obtainAlignmentHirschberg(
1235
+ const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
1236
+ const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
1237
+ const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
1238
+ unsigned char** const alignment, int* const alignmentLength) {
1239
+
1240
+ const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
1241
+ const int W = maxNumBlocks * WORD_SIZE - queryLength;
1242
+
1243
+ Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
1244
+ Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition);
1245
+
1246
+ // Used only to call functions.
1247
+ int score_, endLocation_;
1248
+
1249
+ // Divide dynamic matrix into two halfs, left and right.
1250
+ const int leftHalfWidth = targetLength / 2;
1251
+ const int rightHalfWidth = targetLength - leftHalfWidth;
1252
+
1253
+ // Calculate left half.
1254
+ AlignmentData* alignDataLeftHalf = NULL;
1255
+ int leftHalfCalcStatus = myersCalcEditDistanceNW(
1256
+ Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore,
1257
+ &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1);
1258
+
1259
+ // Calculate right half.
1260
+ AlignmentData* alignDataRightHalf = NULL;
1261
+ int rightHalfCalcStatus = myersCalcEditDistanceNW(
1262
+ rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore,
1263
+ &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1);
1264
+
1265
+ delete[] Peq;
1266
+ delete[] rPeq;
1267
+
1268
+ if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) {
1269
+ if (alignDataLeftHalf) delete alignDataLeftHalf;
1270
+ if (alignDataRightHalf) delete alignDataRightHalf;
1271
+ return EDLIB_STATUS_ERROR;
1272
+ }
1273
+
1274
+ // Unwrap the left half.
1275
+ int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0];
1276
+ int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0];
1277
+ // TODO: avoid this allocation by using some shared array?
1278
+ // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index)
1279
+ // and ending with scoresLeftEndIdx row (0-indexed).
1280
+ int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE;
1281
+ int* scoresLeft = new int[scoresLeftLength];
1282
+ for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) {
1283
+ Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx],
1284
+ alignDataLeftHalf->scores[blockIdx]);
1285
+ readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE);
1286
+ }
1287
+ int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE;
1288
+ // If last block contains padding, shorten the length of scores for the length of padding.
1289
+ if (lastBlockIdxLeft == maxNumBlocks - 1) {
1290
+ scoresLeftLength -= W;
1291
+ }
1292
+
1293
+ // Unwrap the right half (I also reverse it while unwraping).
1294
+ int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0];
1295
+ int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0];
1296
+ int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE;
1297
+ int* scoresRight = new int[scoresRightLength];
1298
+ int* scoresRightOriginalStart = scoresRight;
1299
+ for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) {
1300
+ Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx],
1301
+ alignDataRightHalf->scores[blockIdx]);
1302
+ readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE);
1303
+ }
1304
+ int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE;
1305
+ // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do),
1306
+ // move pointer forward to remove the padding (that is why we remember originalStart).
1307
+ if (scoresRightStartIdx < 0) {
1308
+ //assert(scoresRightStartIdx == -1 * W);
1309
+ scoresRight += W;
1310
+ scoresRightStartIdx += W;
1311
+ scoresRightLength -= W;
1312
+ }
1313
+
1314
+ delete alignDataLeftHalf;
1315
+ delete alignDataRightHalf;
1316
+
1317
+ //--------------------- Find the best move ----------------//
1318
+ // Find the query/row index of cell in left column which together with its lower right neighbour
1319
+ // from right column gives the best score (when summed). We also have to consider boundary cells
1320
+ // (those cells at -1 indexes).
1321
+ // x|
1322
+ // -+-
1323
+ // |x
1324
+ int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1);
1325
+ int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1,
1326
+ scoresRightStartIdx + scoresRightLength - 2);
1327
+ int leftScore = -1, rightScore = -1;
1328
+ int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through.
1329
+ bool queryIdxLeftAlignmentFound = false;
1330
+ for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) {
1331
+ leftScore = scoresLeft[queryIdx - scoresLeftStartIdx];
1332
+ rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx];
1333
+ if (leftScore + rightScore == bestScore) {
1334
+ queryIdxLeftAlignment = queryIdx;
1335
+ queryIdxLeftAlignmentFound = true;
1336
+ break;
1337
+ }
1338
+ }
1339
+ // Check boundary cells.
1340
+ if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) {
1341
+ leftScore = leftHalfWidth;
1342
+ rightScore = scoresRight[0];
1343
+ if (leftScore + rightScore == bestScore) {
1344
+ queryIdxLeftAlignment = -1;
1345
+ queryIdxLeftAlignmentFound = true;
1346
+ }
1347
+ }
1348
+ if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength
1349
+ && scoresRightStartIdx + scoresRightLength == queryLength) {
1350
+ leftScore = scoresLeft[scoresLeftLength - 1];
1351
+ rightScore = rightHalfWidth;
1352
+ if (leftScore + rightScore == bestScore) {
1353
+ queryIdxLeftAlignment = queryLength - 1;
1354
+ queryIdxLeftAlignmentFound = true;
1355
+ }
1356
+ }
1357
+
1358
+ delete[] scoresLeft;
1359
+ delete[] scoresRightOriginalStart;
1360
+
1361
+ if (queryIdxLeftAlignmentFound == false) {
1362
+ // If there was no move that is part of optimal alignment, then there is no such alignment
1363
+ // or given bestScore is not correct!
1364
+ return EDLIB_STATUS_ERROR;
1365
+ }
1366
+ //----------------------------------------------------------//
1367
+
1368
+ // Calculate alignments for upper half of left half (upper left - ul)
1369
+ // and lower half of right half (lower right - lr).
1370
+ const int ulHeight = queryIdxLeftAlignment + 1;
1371
+ const int lrHeight = queryLength - ulHeight;
1372
+ const int ulWidth = leftHalfWidth;
1373
+ const int lrWidth = rightHalfWidth;
1374
+ unsigned char* ulAlignment = NULL; int ulAlignmentLength;
1375
+ int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight,
1376
+ target, rTarget + lrWidth, ulWidth,
1377
+ equalityDefinition, alphabetLength, leftScore,
1378
+ &ulAlignment, &ulAlignmentLength);
1379
+ unsigned char* lrAlignment = NULL; int lrAlignmentLength;
1380
+ int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight,
1381
+ target + ulWidth, rTarget, lrWidth,
1382
+ equalityDefinition, alphabetLength, rightScore,
1383
+ &lrAlignment, &lrAlignmentLength);
1384
+ if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) {
1385
+ if (ulAlignment) free(ulAlignment);
1386
+ if (lrAlignment) free(lrAlignment);
1387
+ return EDLIB_STATUS_ERROR;
1388
+ }
1389
+
1390
+ // Build alignment by concatenating upper left alignment with lower right alignment.
1391
+ *alignmentLength = ulAlignmentLength + lrAlignmentLength;
1392
+ *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
1393
+ memcpy(*alignment, ulAlignment, ulAlignmentLength);
1394
+ memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength);
1395
+
1396
+ free(ulAlignment);
1397
+ free(lrAlignment);
1398
+ return EDLIB_STATUS_OK;
1399
+ }
1400
+
1401
+
1402
+ /**
1403
+ * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences
1404
+ * where elements in sequences are not any more letters of alphabet, but their index in alphabet.
1405
+ * Most of internal edlib functions expect such transformed sequences.
1406
+ * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done.
1407
+ * Example:
1408
+ * Original sequences: "ACT" and "CGT".
1409
+ * Alphabet would be recognized as "ACTG". Alphabet length = 4.
1410
+ * Transformed sequences: [0, 1, 2] and [1, 3, 2].
1411
+ * @param [in] queryOriginal
1412
+ * @param [in] queryLength
1413
+ * @param [in] targetOriginal
1414
+ * @param [in] targetLength
1415
+ * @param [out] queryTransformed It will contain values in range [0, alphabet length - 1].
1416
+ * @param [out] targetTransformed It will contain values in range [0, alphabet length - 1].
1417
+ * @return Alphabet as a string of unique characters, where index of each character is its value in transformed
1418
+ * sequences.
1419
+ */
1420
+ static string transformSequences(const char* const queryOriginal, const int queryLength,
1421
+ const char* const targetOriginal, const int targetLength,
1422
+ unsigned char** const queryTransformed,
1423
+ unsigned char** const targetTransformed) {
1424
+ // Alphabet is constructed from letters that are present in sequences.
1425
+ // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
1426
+ // and new query and target are created in which letters are replaced with their ordinal numbers.
1427
+ // This query and target are used in all the calculations later.
1428
+ *queryTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * queryLength));
1429
+ *targetTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * targetLength));
1430
+
1431
+ string alphabet = "";
1432
+
1433
+ // Alphabet information, it is constructed on fly while transforming sequences.
1434
+ // letterIdx[c] is index of letter c in alphabet.
1435
+ unsigned char letterIdx[MAX_UCHAR + 1];
1436
+ bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet
1437
+ for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false;
1438
+
1439
+ for (int i = 0; i < queryLength; i++) {
1440
+ unsigned char c = static_cast<unsigned char>(queryOriginal[i]);
1441
+ if (!inAlphabet[c]) {
1442
+ inAlphabet[c] = true;
1443
+ letterIdx[c] = static_cast<unsigned char>(alphabet.size());
1444
+ alphabet += queryOriginal[i];
1445
+ }
1446
+ (*queryTransformed)[i] = letterIdx[c];
1447
+ }
1448
+ for (int i = 0; i < targetLength; i++) {
1449
+ unsigned char c = static_cast<unsigned char>(targetOriginal[i]);
1450
+ if (!inAlphabet[c]) {
1451
+ inAlphabet[c] = true;
1452
+ letterIdx[c] = static_cast<unsigned char>(alphabet.size());
1453
+ alphabet += targetOriginal[i];
1454
+ }
1455
+ (*targetTransformed)[i] = letterIdx[c];
1456
+ }
1457
+
1458
+ return alphabet;
1459
+ }
1460
+
1461
+
1462
+ extern "C" EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task,
1463
+ const EdlibEqualityPair* additionalEqualities,
1464
+ int additionalEqualitiesLength) {
1465
+ EdlibAlignConfig config;
1466
+ config.k = k;
1467
+ config.mode = mode;
1468
+ config.task = task;
1469
+ config.additionalEqualities = additionalEqualities;
1470
+ config.additionalEqualitiesLength = additionalEqualitiesLength;
1471
+ return config;
1472
+ }
1473
+
1474
+ extern "C" EdlibAlignConfig edlibDefaultAlignConfig(void) {
1475
+ return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0);
1476
+ }
1477
+
1478
+ extern "C" void edlibFreeAlignResult(EdlibAlignResult result) {
1479
+ if (result.endLocations) free(result.endLocations);
1480
+ if (result.startLocations) free(result.startLocations);
1481
+ if (result.alignment) free(result.alignment);
1482
+ }