edlib 0.0.8 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/edlib/LICENSE +20 -0
- data/ext/edlib/edlib.cpp +70 -67
- data/ext/edlib/edlib.h +1 -1
- data/ext/edlib/edlibext.c +12 -6
- data/lib/edlib/version.rb +1 -1
- data/lib/edlib.rb +1 -1
- metadata +6 -8
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 61e105faf4bfa9c455a5035f7f4c10709af4ccc1ec41c786e9422526a9aae7e7
|
|
4
|
+
data.tar.gz: '07933fceaf42bf6a7fccaa0cde661cd0085a3ff25374ae11fb8ec4dea18d43d1'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3c9ddaa7620ffc834e59ce89e096c62f38d94a23f75b17694b2d4cbf79a5cc36f8fc8edca59a0aadf7d39689b86a166c3f55d5cfd7f85f69820580e474a4bc9d
|
|
7
|
+
data.tar.gz: 21b29d8957092dd0cfff7e11ca35b69f673fd9b70b023a451c534a615f4d3fd3730f80e6e698d723f1f0a27985db3d39c190d1cf3b023225be4848a3f35df793
|
data/ext/edlib/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2014 Martin Šošić
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
7
|
+
the Software without restriction, including without limitation the rights to
|
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
10
|
+
subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/ext/edlib/edlib.cpp
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
#include "edlib.h"
|
|
2
2
|
|
|
3
3
|
#include <stdint.h>
|
|
4
|
+
#include <array>
|
|
4
5
|
#include <cstdlib>
|
|
5
6
|
#include <algorithm>
|
|
6
7
|
#include <vector>
|
|
7
8
|
#include <cstring>
|
|
8
9
|
#include <string>
|
|
9
10
|
|
|
11
|
+
namespace {
|
|
12
|
+
|
|
10
13
|
using namespace std;
|
|
11
14
|
|
|
12
15
|
typedef uint64_t Word;
|
|
@@ -90,6 +93,8 @@ public:
|
|
|
90
93
|
}
|
|
91
94
|
};
|
|
92
95
|
|
|
96
|
+
} // anonymous namespace
|
|
97
|
+
|
|
93
98
|
static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks,
|
|
94
99
|
int queryLength,
|
|
95
100
|
const unsigned char* target, int targetLength,
|
|
@@ -308,7 +313,7 @@ extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, con
|
|
|
308
313
|
moveCodeToChar[0] = moveCodeToChar[3] = 'M';
|
|
309
314
|
}
|
|
310
315
|
|
|
311
|
-
vector<char
|
|
316
|
+
vector<char> cigar;
|
|
312
317
|
char lastMove = 0; // Char of last move. 0 if there was no previous move.
|
|
313
318
|
int numOfSameMoves = 0;
|
|
314
319
|
for (int i = 0; i <= alignmentLength; i++) {
|
|
@@ -317,17 +322,16 @@ extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, con
|
|
|
317
322
|
// Write number of moves to cigar string.
|
|
318
323
|
int numDigits = 0;
|
|
319
324
|
for (; numOfSameMoves; numOfSameMoves /= 10) {
|
|
320
|
-
cigar
|
|
325
|
+
cigar.push_back('0' + numOfSameMoves % 10);
|
|
321
326
|
numDigits++;
|
|
322
327
|
}
|
|
323
|
-
reverse(cigar
|
|
328
|
+
reverse(cigar.end() - numDigits, cigar.end());
|
|
324
329
|
// Write code of move to cigar string.
|
|
325
|
-
cigar
|
|
330
|
+
cigar.push_back(lastMove);
|
|
326
331
|
// If not at the end, start new sequence of moves.
|
|
327
332
|
if (i < alignmentLength) {
|
|
328
333
|
// Check if alignment has valid values.
|
|
329
334
|
if (alignment[i] > 3) {
|
|
330
|
-
delete cigar;
|
|
331
335
|
return 0;
|
|
332
336
|
}
|
|
333
337
|
numOfSameMoves = 0;
|
|
@@ -338,10 +342,9 @@ extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, con
|
|
|
338
342
|
numOfSameMoves++;
|
|
339
343
|
}
|
|
340
344
|
}
|
|
341
|
-
cigar
|
|
342
|
-
char* cigar_ = static_cast<char *>(malloc(cigar
|
|
343
|
-
memcpy(cigar_,
|
|
344
|
-
delete cigar;
|
|
345
|
+
cigar.push_back(0); // Null character termination.
|
|
346
|
+
char* cigar_ = static_cast<char *>(malloc(cigar.size() * sizeof(char)));
|
|
347
|
+
memcpy(cigar_, cigar.data(), cigar.size() * sizeof(char));
|
|
345
348
|
|
|
346
349
|
return cigar_;
|
|
347
350
|
}
|
|
@@ -464,8 +467,8 @@ static inline int max(const int x, const int y) {
|
|
|
464
467
|
* @param [in] block
|
|
465
468
|
* @return Values of cells in block, starting with bottom cell in block.
|
|
466
469
|
*/
|
|
467
|
-
static inline
|
|
468
|
-
|
|
470
|
+
static inline std::array<int, WORD_SIZE> getBlockCellValues(const Block block) {
|
|
471
|
+
std::array<int, WORD_SIZE> scores;
|
|
469
472
|
int score = block.score;
|
|
470
473
|
Word mask = HIGH_BIT_MASK;
|
|
471
474
|
for (int i = 0; i < WORD_SIZE - 1; i++) {
|
|
@@ -518,7 +521,7 @@ static inline void readBlockReverse(const Block block, int* const dest) {
|
|
|
518
521
|
* @return True if all cells in block have value larger than k, otherwise false.
|
|
519
522
|
*/
|
|
520
523
|
static inline bool allBlockCellsLarger(const Block block, const int k) {
|
|
521
|
-
|
|
524
|
+
std::array<int, WORD_SIZE> scores = getBlockCellValues(block);
|
|
522
525
|
for (int i = 0; i < WORD_SIZE; i++) {
|
|
523
526
|
if (scores[i] <= k) return false;
|
|
524
527
|
}
|
|
@@ -557,8 +560,6 @@ static int myersCalcEditDistanceSemiGlobal(
|
|
|
557
560
|
// lastBlock is 0-based index of last block in Ukkonen band.
|
|
558
561
|
int firstBlock = 0;
|
|
559
562
|
int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
|
|
560
|
-
Block *bl; // Current block
|
|
561
|
-
|
|
562
563
|
Block* blocks = new Block[maxNumBlocks];
|
|
563
564
|
|
|
564
565
|
// For HW, solution will never be larger then queryLength.
|
|
@@ -571,16 +572,15 @@ static int myersCalcEditDistanceSemiGlobal(
|
|
|
571
572
|
const int STRONG_REDUCE_NUM = 2048;
|
|
572
573
|
|
|
573
574
|
// Initialize P, M and score
|
|
574
|
-
bl = blocks;
|
|
575
575
|
for (int b = 0; b <= lastBlock; b++) {
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
bl++;
|
|
576
|
+
blocks[b].score = (b + 1) * WORD_SIZE;
|
|
577
|
+
blocks[b].P = static_cast<Word>(-1); // All 1s
|
|
578
|
+
blocks[b].M = static_cast<Word>(0);
|
|
580
579
|
}
|
|
581
580
|
|
|
582
581
|
int bestScore = -1;
|
|
583
|
-
|
|
582
|
+
int bl = 0; // Current block index
|
|
583
|
+
vector<int> positions;
|
|
584
584
|
const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
|
|
585
585
|
const unsigned char* targetChar = target;
|
|
586
586
|
for (int c = 0; c < targetLength; c++) { // for each column
|
|
@@ -588,26 +588,26 @@ static int myersCalcEditDistanceSemiGlobal(
|
|
|
588
588
|
|
|
589
589
|
//----------------------- Calculate column -------------------------//
|
|
590
590
|
int hout = startHout;
|
|
591
|
-
bl =
|
|
591
|
+
bl = firstBlock;
|
|
592
592
|
Peq_c += firstBlock;
|
|
593
593
|
for (int b = firstBlock; b <= lastBlock; b++) {
|
|
594
|
-
hout = calculateBlock(bl
|
|
595
|
-
bl
|
|
594
|
+
hout = calculateBlock(blocks[bl].P, blocks[bl].M, *Peq_c, hout, blocks[bl].P, blocks[bl].M);
|
|
595
|
+
blocks[bl].score += hout;
|
|
596
596
|
bl++; Peq_c++;
|
|
597
597
|
}
|
|
598
598
|
bl--; Peq_c--;
|
|
599
599
|
//------------------------------------------------------------------//
|
|
600
600
|
|
|
601
601
|
//---------- Adjust number of blocks according to Ukkonen ----------//
|
|
602
|
-
if ((lastBlock < maxNumBlocks - 1) && (bl
|
|
602
|
+
if ((lastBlock < maxNumBlocks - 1) && (blocks[bl].score - hout <= k) // bl is pointing to last block
|
|
603
603
|
&& ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
|
|
604
604
|
// If score of left block is not too big, calculate one more block
|
|
605
605
|
lastBlock++; bl++; Peq_c++;
|
|
606
|
-
bl
|
|
607
|
-
bl
|
|
608
|
-
bl
|
|
606
|
+
blocks[bl].P = static_cast<Word>(-1); // All 1s
|
|
607
|
+
blocks[bl].M = static_cast<Word>(0);
|
|
608
|
+
blocks[bl].score = blocks[bl - 1].score - hout + WORD_SIZE + calculateBlock(blocks[bl].P, blocks[bl].M, *Peq_c, hout, blocks[bl].P, blocks[bl].M);
|
|
609
609
|
} else {
|
|
610
|
-
while (lastBlock >= firstBlock && bl
|
|
610
|
+
while (lastBlock >= firstBlock && blocks[bl].score >= k + WORD_SIZE) {
|
|
611
611
|
lastBlock--; bl--; Peq_c--;
|
|
612
612
|
}
|
|
613
613
|
}
|
|
@@ -617,7 +617,7 @@ static int myersCalcEditDistanceSemiGlobal(
|
|
|
617
617
|
//
|
|
618
618
|
// Reduce the band by decreasing last block if possible.
|
|
619
619
|
if (c % STRONG_REDUCE_NUM == 0) {
|
|
620
|
-
while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(
|
|
620
|
+
while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(blocks[bl], k)) {
|
|
621
621
|
lastBlock--; bl--; Peq_c--;
|
|
622
622
|
}
|
|
623
623
|
}
|
|
@@ -656,7 +656,7 @@ static int myersCalcEditDistanceSemiGlobal(
|
|
|
656
656
|
|
|
657
657
|
//------------------------- Update best score ----------------------//
|
|
658
658
|
if (lastBlock == maxNumBlocks - 1) {
|
|
659
|
-
int colScore = bl
|
|
659
|
+
int colScore = blocks[bl].score;
|
|
660
660
|
if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k.
|
|
661
661
|
// NOTE: Score that I find in column c is actually score from column c-W
|
|
662
662
|
if (bestScore == -1 || colScore <= bestScore) {
|
|
@@ -679,7 +679,7 @@ static int myersCalcEditDistanceSemiGlobal(
|
|
|
679
679
|
|
|
680
680
|
// Obtain results for last W columns from last column.
|
|
681
681
|
if (lastBlock == maxNumBlocks - 1) {
|
|
682
|
-
|
|
682
|
+
std::array<int, WORD_SIZE> blockScores = getBlockCellValues(blocks[bl]);
|
|
683
683
|
for (int i = 0; i < W; i++) {
|
|
684
684
|
int colScore = blockScores[i + 1];
|
|
685
685
|
if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
|
|
@@ -753,17 +753,13 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
753
753
|
int firstBlock = 0;
|
|
754
754
|
// This is optimal now, by my formula.
|
|
755
755
|
int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1;
|
|
756
|
-
Block* bl; // Current block
|
|
757
|
-
|
|
758
756
|
Block* blocks = new Block[maxNumBlocks];
|
|
759
757
|
|
|
760
758
|
// Initialize P, M and score
|
|
761
|
-
bl = blocks;
|
|
762
759
|
for (int b = 0; b <= lastBlock; b++) {
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
bl++;
|
|
760
|
+
blocks[b].score = (b + 1) * WORD_SIZE;
|
|
761
|
+
blocks[b].P = static_cast<Word>(-1); // All 1s
|
|
762
|
+
blocks[b].M = static_cast<Word>(0);
|
|
767
763
|
}
|
|
768
764
|
|
|
769
765
|
// If we want to find alignment, we have to store needed data.
|
|
@@ -774,16 +770,17 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
774
770
|
else
|
|
775
771
|
*alignData = NULL;
|
|
776
772
|
|
|
773
|
+
int bl = 0; // Current block index
|
|
777
774
|
const unsigned char* targetChar = target;
|
|
778
775
|
for (int c = 0; c < targetLength; c++) { // for each column
|
|
779
776
|
const Word* Peq_c = Peq + *targetChar * maxNumBlocks;
|
|
780
777
|
|
|
781
778
|
//----------------------- Calculate column -------------------------//
|
|
782
779
|
int hout = 1;
|
|
783
|
-
bl =
|
|
780
|
+
bl = firstBlock;
|
|
784
781
|
for (int b = firstBlock; b <= lastBlock; b++) {
|
|
785
|
-
hout = calculateBlock(bl
|
|
786
|
-
bl
|
|
782
|
+
hout = calculateBlock(blocks[bl].P, blocks[bl].M, Peq_c[b], hout, blocks[bl].P, blocks[bl].M);
|
|
783
|
+
blocks[bl].score += hout;
|
|
787
784
|
bl++;
|
|
788
785
|
}
|
|
789
786
|
bl--;
|
|
@@ -792,7 +789,7 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
792
789
|
|
|
793
790
|
// Update k. I do it only on end of column because it would slow calculation too much otherwise.
|
|
794
791
|
// NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up.
|
|
795
|
-
k = min(k, bl
|
|
792
|
+
k = min(k, blocks[bl].score
|
|
796
793
|
+ max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1)
|
|
797
794
|
+ (lastBlock == maxNumBlocks - 1 ? W : 0));
|
|
798
795
|
|
|
@@ -802,12 +799,12 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
802
799
|
if (lastBlock + 1 < maxNumBlocks
|
|
803
800
|
&& !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also!
|
|
804
801
|
((lastBlock + 1) * WORD_SIZE - 1
|
|
805
|
-
> k - bl
|
|
802
|
+
> k - blocks[bl].score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) {
|
|
806
803
|
lastBlock++; bl++;
|
|
807
|
-
bl
|
|
808
|
-
bl
|
|
809
|
-
int newHout = calculateBlock(bl
|
|
810
|
-
bl
|
|
804
|
+
blocks[bl].P = static_cast<Word>(-1); // All 1s
|
|
805
|
+
blocks[bl].M = static_cast<Word>(0);
|
|
806
|
+
int newHout = calculateBlock(blocks[bl].P, blocks[bl].M, Peq_c[lastBlock], hout, blocks[bl].P, blocks[bl].M);
|
|
807
|
+
blocks[bl].score = blocks[bl - 1].score - hout + WORD_SIZE + newHout;
|
|
811
808
|
hout = newHout;
|
|
812
809
|
}
|
|
813
810
|
|
|
@@ -815,10 +812,10 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
815
812
|
// NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it.
|
|
816
813
|
// I could consider adding that max part, for optimal performance.
|
|
817
814
|
while (lastBlock >= firstBlock
|
|
818
|
-
&& (bl
|
|
815
|
+
&& (blocks[bl].score >= k + WORD_SIZE
|
|
819
816
|
|| ((lastBlock + 1) * WORD_SIZE - 1 >
|
|
820
817
|
// TODO: Does not work if do not put +1! Why???
|
|
821
|
-
k - bl
|
|
818
|
+
k - blocks[bl].score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) {
|
|
822
819
|
lastBlock--; bl--;
|
|
823
820
|
}
|
|
824
821
|
//-------------------------//
|
|
@@ -838,7 +835,7 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
838
835
|
if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction
|
|
839
836
|
while (lastBlock >= firstBlock) {
|
|
840
837
|
// If all cells outside of band, remove block
|
|
841
|
-
|
|
838
|
+
std::array<int, WORD_SIZE> scores = getBlockCellValues(blocks[bl]);
|
|
842
839
|
int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
|
|
843
840
|
int r = lastBlock * WORD_SIZE + numCells - 1;
|
|
844
841
|
bool reduce = true;
|
|
@@ -856,7 +853,7 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
856
853
|
|
|
857
854
|
while (firstBlock <= lastBlock) {
|
|
858
855
|
// If all cells outside of band, remove block
|
|
859
|
-
|
|
856
|
+
std::array<int, WORD_SIZE> scores = getBlockCellValues(blocks[firstBlock]);
|
|
860
857
|
int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
|
|
861
858
|
int r = firstBlock * WORD_SIZE + numCells - 1;
|
|
862
859
|
bool reduce = true;
|
|
@@ -884,11 +881,11 @@ static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int
|
|
|
884
881
|
|
|
885
882
|
//---- Save column so it can be used for reconstruction ----//
|
|
886
883
|
if (findAlignment && c < targetLength) {
|
|
887
|
-
bl =
|
|
884
|
+
bl = firstBlock;
|
|
888
885
|
for (int b = firstBlock; b <= lastBlock; b++) {
|
|
889
|
-
(*alignData)->Ps[maxNumBlocks * c + b] = bl
|
|
890
|
-
(*alignData)->Ms[maxNumBlocks * c + b] = bl
|
|
891
|
-
(*alignData)->scores[maxNumBlocks * c + b] = bl
|
|
886
|
+
(*alignData)->Ps[maxNumBlocks * c + b] = blocks[bl].P;
|
|
887
|
+
(*alignData)->Ms[maxNumBlocks * c + b] = blocks[bl].M;
|
|
888
|
+
(*alignData)->scores[maxNumBlocks * c + b] = blocks[bl].score;
|
|
892
889
|
bl++;
|
|
893
890
|
}
|
|
894
891
|
(*alignData)->firstBlocks[c] = firstBlock;
|
|
@@ -1419,16 +1416,17 @@ static int obtainAlignmentHirschberg(
|
|
|
1419
1416
|
*/
|
|
1420
1417
|
static string transformSequences(const char* const queryOriginal, const int queryLength,
|
|
1421
1418
|
const char* const targetOriginal, const int targetLength,
|
|
1422
|
-
unsigned char** const
|
|
1423
|
-
unsigned char** const
|
|
1419
|
+
unsigned char** const queryTransformed_,
|
|
1420
|
+
unsigned char** const targetTransformed_) {
|
|
1424
1421
|
// Alphabet is constructed from letters that are present in sequences.
|
|
1425
1422
|
// Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
|
|
1426
1423
|
// and new query and target are created in which letters are replaced with their ordinal numbers.
|
|
1427
1424
|
// This query and target are used in all the calculations later.
|
|
1428
|
-
*queryTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * queryLength));
|
|
1429
|
-
*targetTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * targetLength));
|
|
1425
|
+
unsigned char *queryTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * queryLength));
|
|
1426
|
+
unsigned char *targetTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * targetLength));
|
|
1430
1427
|
|
|
1431
|
-
|
|
1428
|
+
char alphabet[MAX_UCHAR + 1];
|
|
1429
|
+
int alphabetSize = 0;
|
|
1432
1430
|
|
|
1433
1431
|
// Alphabet information, it is constructed on fly while transforming sequences.
|
|
1434
1432
|
// letterIdx[c] is index of letter c in alphabet.
|
|
@@ -1440,22 +1438,27 @@ static string transformSequences(const char* const queryOriginal, const int quer
|
|
|
1440
1438
|
unsigned char c = static_cast<unsigned char>(queryOriginal[i]);
|
|
1441
1439
|
if (!inAlphabet[c]) {
|
|
1442
1440
|
inAlphabet[c] = true;
|
|
1443
|
-
|
|
1444
|
-
|
|
1441
|
+
const unsigned char idx = static_cast<unsigned char>(alphabetSize++);
|
|
1442
|
+
letterIdx[c] = idx;
|
|
1443
|
+
alphabet[idx] = queryOriginal[i];
|
|
1445
1444
|
}
|
|
1446
|
-
|
|
1445
|
+
queryTransformed[i] = letterIdx[c];
|
|
1447
1446
|
}
|
|
1448
1447
|
for (int i = 0; i < targetLength; i++) {
|
|
1449
1448
|
unsigned char c = static_cast<unsigned char>(targetOriginal[i]);
|
|
1450
1449
|
if (!inAlphabet[c]) {
|
|
1451
1450
|
inAlphabet[c] = true;
|
|
1452
|
-
|
|
1453
|
-
|
|
1451
|
+
const unsigned char idx = static_cast<unsigned char>(alphabetSize++);
|
|
1452
|
+
letterIdx[c] = idx;
|
|
1453
|
+
alphabet[idx] = targetOriginal[i];
|
|
1454
1454
|
}
|
|
1455
|
-
|
|
1455
|
+
targetTransformed[i] = letterIdx[c];
|
|
1456
1456
|
}
|
|
1457
1457
|
|
|
1458
|
-
|
|
1458
|
+
*queryTransformed_ = queryTransformed;
|
|
1459
|
+
*targetTransformed_ = targetTransformed;
|
|
1460
|
+
|
|
1461
|
+
return std::string(alphabet, alphabetSize);
|
|
1459
1462
|
}
|
|
1460
1463
|
|
|
1461
1464
|
|
data/ext/edlib/edlib.h
CHANGED
|
@@ -200,7 +200,7 @@ extern "C" {
|
|
|
200
200
|
* 1 stands for insertion to target.
|
|
201
201
|
* 2 stands for insertion to query.
|
|
202
202
|
* 3 stands for mismatch.
|
|
203
|
-
* Alignment aligns query to target from
|
|
203
|
+
* Alignment aligns query to target from beginning of query till end of query.
|
|
204
204
|
* If gaps are not penalized, they are not in alignment.
|
|
205
205
|
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
|
|
206
206
|
*/
|
data/ext/edlib/edlibext.c
CHANGED
|
@@ -318,7 +318,7 @@ set_additional_equalities(EdlibAlignConfig *config, EdlibEqualityPair *eqpairs,
|
|
|
318
318
|
}
|
|
319
319
|
|
|
320
320
|
config->additionalEqualities = eqpairs;
|
|
321
|
-
config->additionalEqualitiesLength = len;
|
|
321
|
+
config->additionalEqualitiesLength = (int)len;
|
|
322
322
|
|
|
323
323
|
free(first_arr);
|
|
324
324
|
free(second_arr);
|
|
@@ -329,9 +329,14 @@ set_additional_equalities(EdlibAlignConfig *config, EdlibEqualityPair *eqpairs,
|
|
|
329
329
|
static VALUE
|
|
330
330
|
aligner_set_additional_equalities(VALUE self, VALUE equalities)
|
|
331
331
|
{
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
332
|
+
RbAlignConfig *aligner_config = NULL;
|
|
333
|
+
TypedData_Get_Struct(self, RbAlignConfig, &config_type, aligner_config);
|
|
334
|
+
EdlibAlignConfig *config = aligner_config->config;
|
|
335
|
+
EdlibEqualityPair *eqpairs = aligner_config->equalityPairs;
|
|
336
|
+
VALUE result = set_additional_equalities(config, eqpairs, equalities);
|
|
337
|
+
// Update the pointer in the struct
|
|
338
|
+
aligner_config->equalityPairs = (EdlibEqualityPair *)config->additionalEqualities;
|
|
339
|
+
return result;
|
|
335
340
|
}
|
|
336
341
|
|
|
337
342
|
static VALUE
|
|
@@ -382,9 +387,9 @@ aligner_align(VALUE self, VALUE query, VALUE target)
|
|
|
382
387
|
|
|
383
388
|
EdlibAlignResult result = edlibAlign(
|
|
384
389
|
StringValueCStr(query),
|
|
385
|
-
RSTRING_LEN(query),
|
|
390
|
+
(int)RSTRING_LEN(query),
|
|
386
391
|
StringValueCStr(target),
|
|
387
|
-
RSTRING_LEN(target),
|
|
392
|
+
(int)RSTRING_LEN(target),
|
|
388
393
|
*config);
|
|
389
394
|
|
|
390
395
|
if (result.status != 0)
|
|
@@ -429,6 +434,7 @@ aligner_align(VALUE self, VALUE query, VALUE target)
|
|
|
429
434
|
|
|
430
435
|
char *ccigar = edlibAlignmentToCigar(result.alignment, result.alignmentLength, 1); // EDLIB_CIGAR_EXTENDED
|
|
431
436
|
cigar = rb_str_new2(ccigar);
|
|
437
|
+
free(ccigar);
|
|
432
438
|
|
|
433
439
|
VALUE hash = rb_hash_new();
|
|
434
440
|
rb_hash_aset(hash, ID2SYM(rb_intern("edit_distance")), edit_distance);
|
data/lib/edlib/version.rb
CHANGED
data/lib/edlib.rb
CHANGED
metadata
CHANGED
|
@@ -1,17 +1,16 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: edlib
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.8
|
|
4
|
+
version: 0.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- kojix2
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies: []
|
|
13
|
-
description:
|
|
14
|
-
(Levenshtein) distance.
|
|
12
|
+
description: Lightweight, super fast C/C++ library for sequence alignment using edit
|
|
13
|
+
(Levenshtein) distance.
|
|
15
14
|
email:
|
|
16
15
|
- 2xijok@gmail.com
|
|
17
16
|
executables: []
|
|
@@ -20,6 +19,7 @@ extensions:
|
|
|
20
19
|
extra_rdoc_files: []
|
|
21
20
|
files:
|
|
22
21
|
- README.md
|
|
22
|
+
- ext/edlib/LICENSE
|
|
23
23
|
- ext/edlib/edlib.cpp
|
|
24
24
|
- ext/edlib/edlib.h
|
|
25
25
|
- ext/edlib/edlibext.c
|
|
@@ -31,7 +31,6 @@ homepage: https://github.com/kojix2/ruby-edlib
|
|
|
31
31
|
licenses:
|
|
32
32
|
- MIT
|
|
33
33
|
metadata: {}
|
|
34
|
-
post_install_message:
|
|
35
34
|
rdoc_options: []
|
|
36
35
|
require_paths:
|
|
37
36
|
- lib
|
|
@@ -46,8 +45,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
46
45
|
- !ruby/object:Gem::Version
|
|
47
46
|
version: '0'
|
|
48
47
|
requirements: []
|
|
49
|
-
rubygems_version: 3.
|
|
50
|
-
signing_key:
|
|
48
|
+
rubygems_version: 3.6.9
|
|
51
49
|
specification_version: 4
|
|
52
50
|
summary: ruby-edlib is a wrapper for edlib.
|
|
53
51
|
test_files: []
|