bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
/* QSufSort.h
|
2
|
+
|
3
|
+
Header file for QSufSort.c
|
4
|
+
|
5
|
+
This file contains an implementation of the algorithm presented in "Faster
|
6
|
+
Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
|
7
|
+
Sadakane (sada@is.s.u-tokyo.ac.jp).
|
8
|
+
|
9
|
+
This software may be used freely for any purpose. However, when distributed,
|
10
|
+
the original source must be clearly stated, and, when the source code is
|
11
|
+
distributed, the copyright notice must be retained and any alterations in
|
12
|
+
the code must be clearly marked. No warranty is given regarding the quality
|
13
|
+
of this software.
|
14
|
+
|
15
|
+
Modified by Wong Chi-Kwong, 2004
|
16
|
+
|
17
|
+
Changes summary: - Used long variable and function names
|
18
|
+
- Removed global variables
|
19
|
+
- Replace pointer references with array references
|
20
|
+
- Used insertion sort in place of selection sort and increased insertion sort threshold
|
21
|
+
- Reconstructing suffix array from inverse becomes an option
|
22
|
+
- Add handling where end-of-text symbol is not necessary < all characters
|
23
|
+
- Removed codes for supporting alphabet size > number of characters
|
24
|
+
|
25
|
+
No warrenty is given regarding the quality of the modifications.
|
26
|
+
|
27
|
+
*/
|
28
|
+
|
29
|
+
#ifndef __QSUFSORT_H__
|
30
|
+
#define __QSUFSORT_H__
|
31
|
+
|
32
|
+
#define KEY(V, I, p, h) ( V[ I[p] + h ] )
|
33
|
+
#define INSERT_SORT_NUM_ITEM 16
|
34
|
+
|
35
|
+
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
|
36
|
+
const int smallestInputSymbol, const int skipTransform);
|
37
|
+
void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar);
|
38
|
+
|
39
|
+
|
40
|
+
#endif
|
@@ -0,0 +1,1547 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
BWTConstruct.c BWT-Index Construction
|
4
|
+
|
5
|
+
This module constructs BWT and auxiliary data structures.
|
6
|
+
|
7
|
+
Copyright (C) 2004, Wong Chi Kwong.
|
8
|
+
|
9
|
+
This program is free software; you can redistribute it and/or
|
10
|
+
modify it under the terms of the GNU General Public License
|
11
|
+
as published by the Free Software Foundation; either version 2
|
12
|
+
of the License, or (at your option) any later version.
|
13
|
+
|
14
|
+
This program is distributed in the hope that it will be useful,
|
15
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
GNU General Public License for more details.
|
18
|
+
|
19
|
+
You should have received a copy of the GNU General Public License
|
20
|
+
along with this program; if not, write to the Free Software
|
21
|
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
22
|
+
|
23
|
+
*/
|
24
|
+
|
25
|
+
#include <stdio.h>
|
26
|
+
#include <stdlib.h>
|
27
|
+
#include <string.h>
|
28
|
+
#include "bwt_gen.h"
|
29
|
+
#include "QSufSort.h"
|
30
|
+
|
31
|
+
static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
|
32
|
+
unsigned int lastByteLength)
|
33
|
+
{
|
34
|
+
if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
|
35
|
+
fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
|
36
|
+
exit(1);
|
37
|
+
}
|
38
|
+
return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
|
39
|
+
}
|
40
|
+
|
41
|
+
static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
|
42
|
+
{
|
43
|
+
unsigned int i;
|
44
|
+
for (i=0; i<length; i++) startAddr[i] = initValue;
|
45
|
+
}
|
46
|
+
|
47
|
+
static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
|
48
|
+
{
|
49
|
+
unsigned int i, j, c, t;
|
50
|
+
|
51
|
+
for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
|
52
|
+
dnaDecodeTable[i] = 0;
|
53
|
+
c = i;
|
54
|
+
for (j=0; j<8; j++) {
|
55
|
+
t = c & 0x00000003;
|
56
|
+
dnaDecodeTable[i] += 1 << (t * 8);
|
57
|
+
c >>= 2;
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
}
|
62
|
+
// for BWTIncCreate()
|
63
|
+
static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
|
64
|
+
{
|
65
|
+
unsigned int numOfOccValue;
|
66
|
+
unsigned int numOfOccIntervalPerMajor;
|
67
|
+
numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
68
|
+
numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
|
69
|
+
return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
|
70
|
+
}
|
71
|
+
// for BWTIncCreate()
|
72
|
+
static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
|
73
|
+
{
|
74
|
+
unsigned int numOfOccValue;
|
75
|
+
numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
76
|
+
return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
|
77
|
+
}
|
78
|
+
// for BWTIncCreate()
|
79
|
+
static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
|
80
|
+
|
81
|
+
unsigned int numCharRoundUpToOccInterval;
|
82
|
+
|
83
|
+
// The $ in BWT at the position of inverseSa0 is not encoded
|
84
|
+
numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
|
85
|
+
|
86
|
+
return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
|
87
|
+
|
88
|
+
}
|
89
|
+
|
90
|
+
static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
|
91
|
+
{
|
92
|
+
unsigned int maxBuildSize;
|
93
|
+
|
94
|
+
if (bwtInc->bwt->textLength == 0) {
|
95
|
+
// initial build
|
96
|
+
// Minus 2 because n+1 entries of seq and rank needed for n char
|
97
|
+
maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
|
98
|
+
/ (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
|
99
|
+
if (bwtInc->initialMaxBuildSize > 0) {
|
100
|
+
bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
|
101
|
+
} else {
|
102
|
+
bwtInc->buildSize = maxBuildSize;
|
103
|
+
}
|
104
|
+
} else {
|
105
|
+
// Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
|
106
|
+
// Minus numberOfIterationDone because bwt slightly shift to left in each iteration
|
107
|
+
maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
|
108
|
+
- bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR)
|
109
|
+
/ 3;
|
110
|
+
if (maxBuildSize < CHAR_PER_WORD) {
|
111
|
+
fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
|
112
|
+
exit(1);
|
113
|
+
}
|
114
|
+
if (bwtInc->incMaxBuildSize > 0) {
|
115
|
+
bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
|
116
|
+
} else {
|
117
|
+
bwtInc->buildSize = maxBuildSize;
|
118
|
+
}
|
119
|
+
if (bwtInc->buildSize < CHAR_PER_WORD) {
|
120
|
+
bwtInc->buildSize = CHAR_PER_WORD;
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
124
|
+
if (bwtInc->buildSize < CHAR_PER_WORD) {
|
125
|
+
fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
|
126
|
+
exit(1);
|
127
|
+
}
|
128
|
+
|
129
|
+
bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
|
130
|
+
|
131
|
+
bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
|
132
|
+
bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);
|
133
|
+
|
134
|
+
}
|
135
|
+
|
136
|
+
// for ceilLog2()
|
137
|
+
unsigned int leadingZero(const unsigned int input)
|
138
|
+
{
|
139
|
+
unsigned int l;
|
140
|
+
const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
141
|
+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
142
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
143
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
144
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
145
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
146
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
147
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
148
|
+
|
149
|
+
if (input & 0xFFFF0000) {
|
150
|
+
if (input & 0xFF000000) {
|
151
|
+
l = leadingZero8bit[input >> 24];
|
152
|
+
} else {
|
153
|
+
l = 8 + leadingZero8bit[input >> 16];
|
154
|
+
}
|
155
|
+
} else {
|
156
|
+
if (input & 0x0000FF00) {
|
157
|
+
l = 16 + leadingZero8bit[input >> 8];
|
158
|
+
} else {
|
159
|
+
l = 24 + leadingZero8bit[input];
|
160
|
+
}
|
161
|
+
}
|
162
|
+
return l;
|
163
|
+
|
164
|
+
}
|
165
|
+
// for BitPerBytePackedChar()
|
166
|
+
static unsigned int ceilLog2(const unsigned int input)
|
167
|
+
{
|
168
|
+
if (input <= 1) return 0;
|
169
|
+
return BITS_IN_WORD - leadingZero(input - 1);
|
170
|
+
|
171
|
+
}
|
172
|
+
// for ConvertBytePackedToWordPacked()
|
173
|
+
static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
|
174
|
+
{
|
175
|
+
unsigned int bitPerChar;
|
176
|
+
bitPerChar = ceilLog2(alphabetSize);
|
177
|
+
// Return the largest number of bit that does not affect packing efficiency
|
178
|
+
if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
|
179
|
+
bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
|
180
|
+
return bitPerChar;
|
181
|
+
}
|
182
|
+
// for ConvertBytePackedToWordPacked()
|
183
|
+
static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
|
184
|
+
{
|
185
|
+
return ceilLog2(alphabetSize);
|
186
|
+
}
|
187
|
+
|
188
|
+
static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
|
189
|
+
const unsigned int textLength)
|
190
|
+
{
|
191
|
+
unsigned int i, j, k;
|
192
|
+
unsigned int c;
|
193
|
+
unsigned int bitPerBytePackedChar;
|
194
|
+
unsigned int bitPerWordPackedChar;
|
195
|
+
unsigned int charPerWord;
|
196
|
+
unsigned int charPerByte;
|
197
|
+
unsigned int bytePerIteration;
|
198
|
+
unsigned int byteProcessed = 0;
|
199
|
+
unsigned int wordProcessed = 0;
|
200
|
+
unsigned int mask, shift;
|
201
|
+
|
202
|
+
unsigned int buffer[BITS_IN_WORD];
|
203
|
+
|
204
|
+
bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
|
205
|
+
bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
|
206
|
+
charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
|
207
|
+
charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
|
208
|
+
|
209
|
+
bytePerIteration = charPerWord / charPerByte;
|
210
|
+
mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
|
211
|
+
shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
|
212
|
+
|
213
|
+
while ((wordProcessed + 1) * charPerWord < textLength) {
|
214
|
+
|
215
|
+
k = 0;
|
216
|
+
for (i=0; i<bytePerIteration; i++) {
|
217
|
+
c = (unsigned int)input[byteProcessed] << shift;
|
218
|
+
for (j=0; j<charPerByte; j++) {
|
219
|
+
buffer[k] = c & mask;
|
220
|
+
c <<= bitPerBytePackedChar;
|
221
|
+
k++;
|
222
|
+
}
|
223
|
+
byteProcessed++;
|
224
|
+
}
|
225
|
+
|
226
|
+
c = 0;
|
227
|
+
for (i=0; i<charPerWord; i++) {
|
228
|
+
c |= buffer[i] >> bitPerWordPackedChar * i;
|
229
|
+
}
|
230
|
+
output[wordProcessed] = c;
|
231
|
+
wordProcessed++;
|
232
|
+
|
233
|
+
}
|
234
|
+
|
235
|
+
k = 0;
|
236
|
+
for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
|
237
|
+
c = (unsigned int)input[byteProcessed] << shift;
|
238
|
+
for (j=0; j<charPerByte; j++) {
|
239
|
+
buffer[k] = c & mask;
|
240
|
+
c <<= bitPerBytePackedChar;
|
241
|
+
k++;
|
242
|
+
}
|
243
|
+
byteProcessed++;
|
244
|
+
}
|
245
|
+
|
246
|
+
c = 0;
|
247
|
+
for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
|
248
|
+
c |= buffer[i] >> bitPerWordPackedChar * i;
|
249
|
+
}
|
250
|
+
output[wordProcessed] = c;
|
251
|
+
}
|
252
|
+
|
253
|
+
BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
|
254
|
+
{
|
255
|
+
BWT *bwt;
|
256
|
+
|
257
|
+
bwt = (BWT*)calloc(1, sizeof(BWT));
|
258
|
+
|
259
|
+
bwt->textLength = 0;
|
260
|
+
bwt->inverseSa = 0;
|
261
|
+
|
262
|
+
bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
|
263
|
+
initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
|
264
|
+
|
265
|
+
bwt->bwtSizeInWord = 0;
|
266
|
+
bwt->saValueOnBoundary = NULL;
|
267
|
+
|
268
|
+
// Generate decode tables
|
269
|
+
if (decodeTable == NULL) {
|
270
|
+
bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
|
271
|
+
GenerateDNAOccCountTable(bwt->decodeTable);
|
272
|
+
} else {
|
273
|
+
bwt->decodeTable = decodeTable;
|
274
|
+
}
|
275
|
+
|
276
|
+
bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
|
277
|
+
bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));
|
278
|
+
|
279
|
+
bwt->occSizeInWord = 0;
|
280
|
+
bwt->occValue = NULL;
|
281
|
+
|
282
|
+
bwt->saInterval = ALL_ONE_MASK;
|
283
|
+
bwt->saValueSize = 0;
|
284
|
+
bwt->saValue = NULL;
|
285
|
+
|
286
|
+
bwt->inverseSaInterval = ALL_ONE_MASK;
|
287
|
+
bwt->inverseSaSize = 0;
|
288
|
+
bwt->inverseSa = NULL;
|
289
|
+
|
290
|
+
return bwt;
|
291
|
+
}
|
292
|
+
|
293
|
+
BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
|
294
|
+
const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
|
295
|
+
{
|
296
|
+
BWTInc *bwtInc;
|
297
|
+
unsigned int i;
|
298
|
+
|
299
|
+
if (targetNBit == 0) {
|
300
|
+
fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n");
|
301
|
+
exit(1);
|
302
|
+
}
|
303
|
+
|
304
|
+
bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
|
305
|
+
bwtInc->numberOfIterationDone = 0;
|
306
|
+
bwtInc->bwt = BWTCreate(textLength, NULL);
|
307
|
+
bwtInc->initialMaxBuildSize = initialMaxBuildSize;
|
308
|
+
bwtInc->incMaxBuildSize = incMaxBuildSize;
|
309
|
+
bwtInc->targetNBit = targetNBit;
|
310
|
+
bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
|
311
|
+
initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
312
|
+
|
313
|
+
// Build frequently accessed data
|
314
|
+
bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
|
315
|
+
for (i=0; i<CHAR_PER_WORD; i++) {
|
316
|
+
bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
|
317
|
+
}
|
318
|
+
|
319
|
+
bwtInc->targetTextLength = textLength;
|
320
|
+
bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
|
321
|
+
if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
|
322
|
+
fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
|
323
|
+
exit(1);
|
324
|
+
}
|
325
|
+
bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
|
326
|
+
|
327
|
+
return bwtInc;
|
328
|
+
|
329
|
+
}
|
330
|
+
// for BWTIncConstruct()
|
331
|
+
static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
|
332
|
+
unsigned int* __restrict cumulativeCount, const unsigned int numChar)
|
333
|
+
{
|
334
|
+
unsigned int i, j;
|
335
|
+
unsigned int c, t;
|
336
|
+
unsigned int packedMask;
|
337
|
+
unsigned int rankIndex;
|
338
|
+
unsigned int lastWord, numCharInLastWord;
|
339
|
+
|
340
|
+
lastWord = (numChar - 1) / CHAR_PER_WORD;
|
341
|
+
numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
|
342
|
+
|
343
|
+
packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
|
344
|
+
rankIndex = numChar - 1;
|
345
|
+
|
346
|
+
t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
|
347
|
+
for (i=0; i<numCharInLastWord; i++) {
|
348
|
+
c = t & packedMask;
|
349
|
+
cumulativeCount[c+1]++;
|
350
|
+
rank[rankIndex] = c;
|
351
|
+
rankIndex--;
|
352
|
+
t >>= BIT_PER_CHAR;
|
353
|
+
}
|
354
|
+
|
355
|
+
for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
|
356
|
+
t = packedText[i];
|
357
|
+
for (j=0; j<CHAR_PER_WORD; j++) {
|
358
|
+
c = t & packedMask;
|
359
|
+
cumulativeCount[c+1]++;
|
360
|
+
rank[rankIndex] = c;
|
361
|
+
rankIndex--;
|
362
|
+
t >>= BIT_PER_CHAR;
|
363
|
+
}
|
364
|
+
}
|
365
|
+
|
366
|
+
// Convert occurrence to cumulativeCount
|
367
|
+
cumulativeCount[2] += cumulativeCount[1];
|
368
|
+
cumulativeCount[3] += cumulativeCount[2];
|
369
|
+
cumulativeCount[4] += cumulativeCount[3];
|
370
|
+
}
|
371
|
+
|
372
|
+
|
373
|
+
static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index,
|
374
|
+
unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable)
|
375
|
+
{
|
376
|
+
static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
|
377
|
+
0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
|
378
|
+
0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
|
379
|
+
0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
|
380
|
+
|
381
|
+
unsigned int iteration, wordToCount, charToCount;
|
382
|
+
unsigned int i, j, c;
|
383
|
+
unsigned int sum;
|
384
|
+
|
385
|
+
occCount[0] = 0;
|
386
|
+
occCount[1] = 0;
|
387
|
+
occCount[2] = 0;
|
388
|
+
occCount[3] = 0;
|
389
|
+
|
390
|
+
iteration = index / 256;
|
391
|
+
wordToCount = (index - iteration * 256) / 16;
|
392
|
+
charToCount = index - iteration * 256 - wordToCount * 16;
|
393
|
+
|
394
|
+
for (i=0; i<iteration; i++) {
|
395
|
+
|
396
|
+
sum = 0;
|
397
|
+
for (j=0; j<16; j++) {
|
398
|
+
sum += dnaDecodeTable[*dna >> 16];
|
399
|
+
sum += dnaDecodeTable[*dna & 0x0000FFFF];
|
400
|
+
dna++;
|
401
|
+
}
|
402
|
+
if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
403
|
+
occCount[0] += sum & 0x000000FF; sum >>= 8;
|
404
|
+
occCount[1] += sum & 0x000000FF; sum >>= 8;
|
405
|
+
occCount[2] += sum & 0x000000FF; sum >>= 8;
|
406
|
+
occCount[3] += sum;
|
407
|
+
} else {
|
408
|
+
// only some or all of the 3 bits are on
|
409
|
+
// in reality, only one of the four cases are possible
|
410
|
+
if (sum == 0x00000100) {
|
411
|
+
occCount[0] += 256;
|
412
|
+
} else if (sum == 0x00010000) {
|
413
|
+
occCount[1] += 256;
|
414
|
+
} else if (sum == 0x01000000) {
|
415
|
+
occCount[2] += 256;
|
416
|
+
} else if (sum == 0x00000000) {
|
417
|
+
occCount[3] += 256;
|
418
|
+
} else {
|
419
|
+
fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
|
420
|
+
exit(1);
|
421
|
+
}
|
422
|
+
}
|
423
|
+
|
424
|
+
}
|
425
|
+
|
426
|
+
sum = 0;
|
427
|
+
for (j=0; j<wordToCount; j++) {
|
428
|
+
sum += dnaDecodeTable[*dna >> 16];
|
429
|
+
sum += dnaDecodeTable[*dna & 0x0000FFFF];
|
430
|
+
dna++;
|
431
|
+
}
|
432
|
+
|
433
|
+
if (charToCount > 0) {
|
434
|
+
c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
|
435
|
+
sum += dnaDecodeTable[c >> 16];
|
436
|
+
sum += dnaDecodeTable[c & 0xFFFF];
|
437
|
+
sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
|
438
|
+
}
|
439
|
+
|
440
|
+
occCount[0] += sum & 0x000000FF; sum >>= 8;
|
441
|
+
occCount[1] += sum & 0x000000FF; sum >>= 8;
|
442
|
+
occCount[2] += sum & 0x000000FF; sum >>= 8;
|
443
|
+
occCount[3] += sum;
|
444
|
+
}
|
445
|
+
|
446
|
+
static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
|
447
|
+
const unsigned int *cumulativeCount, const unsigned int *packedShift) {
|
448
|
+
|
449
|
+
unsigned int i, c, r;
|
450
|
+
unsigned int previousRank, currentRank;
|
451
|
+
unsigned int wordIndex, charIndex;
|
452
|
+
unsigned int inverseSa0;
|
453
|
+
|
454
|
+
inverseSa0 = previousRank = relativeRank[0];
|
455
|
+
|
456
|
+
for (i=1; i<=numChar; i++) {
|
457
|
+
currentRank = relativeRank[i];
|
458
|
+
// previousRank > cumulativeCount[c] because $ is one of the char
|
459
|
+
c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2])
|
460
|
+
+ (previousRank > cumulativeCount[3]);
|
461
|
+
// set bwt for currentRank
|
462
|
+
if (c > 0) {
|
463
|
+
// c <> 'a'
|
464
|
+
r = currentRank;
|
465
|
+
if (r > inverseSa0) {
|
466
|
+
// - 1 because $ at inverseSa0 is not encoded
|
467
|
+
r--;
|
468
|
+
}
|
469
|
+
wordIndex = r / CHAR_PER_WORD;
|
470
|
+
charIndex = r - wordIndex * CHAR_PER_WORD;
|
471
|
+
bwt[wordIndex] |= c << packedShift[charIndex];
|
472
|
+
}
|
473
|
+
previousRank = currentRank;
|
474
|
+
}
|
475
|
+
}
|
476
|
+
|
477
|
+
static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
|
478
|
+
const unsigned int character)
|
479
|
+
{
|
480
|
+
unsigned int occIndexMajor;
|
481
|
+
|
482
|
+
occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
|
483
|
+
|
484
|
+
if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
|
485
|
+
return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
|
486
|
+
(bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
|
487
|
+
|
488
|
+
} else {
|
489
|
+
return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
|
490
|
+
(bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
|
491
|
+
}
|
492
|
+
}
|
493
|
+
|
494
|
+
|
495
|
+
static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
|
496
|
+
const unsigned int* dnaDecodeTable)
|
497
|
+
{
|
498
|
+
static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
|
499
|
+
0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
|
500
|
+
0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
|
501
|
+
0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
|
502
|
+
|
503
|
+
unsigned int wordToCount, charToCount;
|
504
|
+
unsigned int i, c;
|
505
|
+
unsigned int sum = 0;
|
506
|
+
|
507
|
+
wordToCount = index / 16;
|
508
|
+
charToCount = index - wordToCount * 16;
|
509
|
+
|
510
|
+
for (i=0; i<wordToCount; i++) {
|
511
|
+
sum += dnaDecodeTable[dna[i] >> 16];
|
512
|
+
sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
|
513
|
+
}
|
514
|
+
|
515
|
+
if (charToCount > 0) {
|
516
|
+
c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
|
517
|
+
sum += dnaDecodeTable[c >> 16];
|
518
|
+
sum += dnaDecodeTable[c & 0xFFFF];
|
519
|
+
sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
|
520
|
+
}
|
521
|
+
|
522
|
+
return (sum >> (character * 8)) & 0x000000FF;
|
523
|
+
|
524
|
+
}
|
525
|
+
|
526
|
+
static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
|
527
|
+
const unsigned int* dnaDecodeTable)
|
528
|
+
{
|
529
|
+
static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
|
530
|
+
0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
|
531
|
+
0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
|
532
|
+
0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };
|
533
|
+
|
534
|
+
unsigned int wordToCount, charToCount;
|
535
|
+
unsigned int i, c;
|
536
|
+
unsigned int sum = 0;
|
537
|
+
|
538
|
+
wordToCount = index / 16;
|
539
|
+
charToCount = index - wordToCount * 16;
|
540
|
+
|
541
|
+
dna -= wordToCount + 1;
|
542
|
+
|
543
|
+
if (charToCount > 0) {
|
544
|
+
c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c;
|
545
|
+
sum += dnaDecodeTable[c >> 16];
|
546
|
+
sum += dnaDecodeTable[c & 0xFFFF];
|
547
|
+
sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
|
548
|
+
}
|
549
|
+
|
550
|
+
for (i=0; i<wordToCount; i++) {
|
551
|
+
dna++;
|
552
|
+
sum += dnaDecodeTable[*dna >> 16];
|
553
|
+
sum += dnaDecodeTable[*dna & 0x0000FFFF];
|
554
|
+
}
|
555
|
+
|
556
|
+
return (sum >> (character * 8)) & 0x000000FF;
|
557
|
+
|
558
|
+
}
|
559
|
+
|
560
|
+
unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {
|
561
|
+
|
562
|
+
unsigned int occValue;
|
563
|
+
unsigned int occExplicitIndex, occIndex;
|
564
|
+
|
565
|
+
// $ is supposed to be positioned at inverseSa0 but it is not encoded
|
566
|
+
// therefore index is subtracted by 1 for adjustment
|
567
|
+
if (index > bwt->inverseSa0) {
|
568
|
+
index--;
|
569
|
+
}
|
570
|
+
|
571
|
+
occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding
|
572
|
+
occIndex = occExplicitIndex * OCC_INTERVAL;
|
573
|
+
occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
|
574
|
+
|
575
|
+
if (occIndex == index) {
|
576
|
+
return occValue;
|
577
|
+
}
|
578
|
+
|
579
|
+
if (occIndex < index) {
|
580
|
+
return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
|
581
|
+
} else {
|
582
|
+
return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
|
583
|
+
}
|
584
|
+
|
585
|
+
}
|
586
|
+
|
587
|
+
static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
|
588
|
+
const unsigned int *packedText, const unsigned int numChar,
|
589
|
+
const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
|
590
|
+
{
|
591
|
+
unsigned int saIndex;
|
592
|
+
unsigned int lastWord;
|
593
|
+
unsigned int packedMask;
|
594
|
+
unsigned int i, j;
|
595
|
+
unsigned int c, t;
|
596
|
+
unsigned int rankIndex;
|
597
|
+
unsigned int shift;
|
598
|
+
unsigned int seqIndexFromStart[ALPHABET_SIZE];
|
599
|
+
unsigned int seqIndexFromEnd[ALPHABET_SIZE];
|
600
|
+
|
601
|
+
for (i=0; i<ALPHABET_SIZE; i++) {
|
602
|
+
seqIndexFromStart[i] = cumulativeCount[i];
|
603
|
+
seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
|
604
|
+
}
|
605
|
+
|
606
|
+
shift = BITS_IN_WORD - BIT_PER_CHAR;
|
607
|
+
packedMask = ALL_ONE_MASK >> shift;
|
608
|
+
saIndex = bwt->inverseSa0;
|
609
|
+
rankIndex = numChar - 1;
|
610
|
+
|
611
|
+
lastWord = numChar / CHAR_PER_WORD;
|
612
|
+
for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
|
613
|
+
t = packedText[i];
|
614
|
+
for (j=0; j<CHAR_PER_WORD; j++) {
|
615
|
+
c = t & packedMask;
|
616
|
+
saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
|
617
|
+
// A counting sort using the first character of suffix is done here
|
618
|
+
// If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
|
619
|
+
if (saIndex > bwt->inverseSa0) {
|
620
|
+
seq[seqIndexFromEnd[c]] = rankIndex;
|
621
|
+
absoluteRank[seqIndexFromEnd[c]] = saIndex;
|
622
|
+
seqIndexFromEnd[c]--;
|
623
|
+
} else {
|
624
|
+
seq[seqIndexFromStart[c]] = rankIndex;
|
625
|
+
absoluteRank[seqIndexFromStart[c]] = saIndex;
|
626
|
+
seqIndexFromStart[c]++;
|
627
|
+
}
|
628
|
+
rankIndex--;
|
629
|
+
t >>= BIT_PER_CHAR;
|
630
|
+
}
|
631
|
+
}
|
632
|
+
|
633
|
+
absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters
|
634
|
+
seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
|
635
|
+
|
636
|
+
return seqIndexFromStart[firstCharInLastIteration];
|
637
|
+
}
|
638
|
+
|
639
|
+
static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
|
640
|
+
{
|
641
|
+
#define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
|
642
|
+
|
643
|
+
int lowIndex, highIndex, midIndex;
|
644
|
+
int lowPartitionIndex, highPartitionIndex;
|
645
|
+
int lowStack[32], highStack[32];
|
646
|
+
int stackDepth;
|
647
|
+
int i, j;
|
648
|
+
unsigned int tempSeq, tempKey;
|
649
|
+
int numberOfEqualKey;
|
650
|
+
|
651
|
+
if (numItem < 2) return;
|
652
|
+
|
653
|
+
stackDepth = 0;
|
654
|
+
|
655
|
+
lowIndex = 0;
|
656
|
+
highIndex = numItem - 1;
|
657
|
+
|
658
|
+
for (;;) {
|
659
|
+
|
660
|
+
for (;;) {
|
661
|
+
|
662
|
+
// Sort small array of data
|
663
|
+
if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays
|
664
|
+
for (i=lowIndex+1; i<=highIndex; i++) {
|
665
|
+
tempSeq = seq[i];
|
666
|
+
tempKey = key[i];
|
667
|
+
for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
|
668
|
+
seq[j] = seq[j-1];
|
669
|
+
key[j] = key[j-1];
|
670
|
+
}
|
671
|
+
if (j != i) {
|
672
|
+
seq[j] = tempSeq;
|
673
|
+
key[j] = tempKey;
|
674
|
+
}
|
675
|
+
}
|
676
|
+
break;
|
677
|
+
}
|
678
|
+
|
679
|
+
// Choose pivot as median of the lowest, middle, and highest data; sort the three data
|
680
|
+
|
681
|
+
midIndex = average(lowIndex, highIndex);
|
682
|
+
if (key[lowIndex] > key[midIndex]) {
|
683
|
+
tempSeq = seq[lowIndex];
|
684
|
+
tempKey = key[lowIndex];
|
685
|
+
seq[lowIndex] = seq[midIndex];
|
686
|
+
key[lowIndex] = key[midIndex];
|
687
|
+
seq[midIndex] = tempSeq;
|
688
|
+
key[midIndex] = tempKey;
|
689
|
+
}
|
690
|
+
if (key[lowIndex] > key[highIndex]) {
|
691
|
+
tempSeq = seq[lowIndex];
|
692
|
+
tempKey = key[lowIndex];
|
693
|
+
seq[lowIndex] = seq[highIndex];
|
694
|
+
key[lowIndex] = key[highIndex];
|
695
|
+
seq[highIndex] = tempSeq;
|
696
|
+
key[highIndex] = tempKey;
|
697
|
+
}
|
698
|
+
if (key[midIndex] > key[highIndex]) {
|
699
|
+
tempSeq = seq[midIndex];
|
700
|
+
tempKey = key[midIndex];
|
701
|
+
seq[midIndex] = seq[highIndex];
|
702
|
+
key[midIndex] = key[highIndex];
|
703
|
+
seq[highIndex] = tempSeq;
|
704
|
+
key[highIndex] = tempKey;
|
705
|
+
}
|
706
|
+
|
707
|
+
// Partition data
|
708
|
+
|
709
|
+
numberOfEqualKey = 0;
|
710
|
+
|
711
|
+
lowPartitionIndex = lowIndex + 1;
|
712
|
+
highPartitionIndex = highIndex - 1;
|
713
|
+
|
714
|
+
for (;;) {
|
715
|
+
while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
|
716
|
+
numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
|
717
|
+
lowPartitionIndex++;
|
718
|
+
}
|
719
|
+
while (lowPartitionIndex < highPartitionIndex) {
|
720
|
+
if (key[midIndex] >= key[highPartitionIndex]) {
|
721
|
+
numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
|
722
|
+
break;
|
723
|
+
}
|
724
|
+
highPartitionIndex--;
|
725
|
+
}
|
726
|
+
if (lowPartitionIndex >= highPartitionIndex) {
|
727
|
+
break;
|
728
|
+
}
|
729
|
+
tempSeq = seq[lowPartitionIndex];
|
730
|
+
tempKey = key[lowPartitionIndex];
|
731
|
+
seq[lowPartitionIndex] = seq[highPartitionIndex];
|
732
|
+
key[lowPartitionIndex] = key[highPartitionIndex];
|
733
|
+
seq[highPartitionIndex] = tempSeq;
|
734
|
+
key[highPartitionIndex] = tempKey;
|
735
|
+
if (highPartitionIndex == midIndex) {
|
736
|
+
// partition key has been moved
|
737
|
+
midIndex = lowPartitionIndex;
|
738
|
+
}
|
739
|
+
lowPartitionIndex++;
|
740
|
+
highPartitionIndex--;
|
741
|
+
}
|
742
|
+
|
743
|
+
// Adjust the partition index
|
744
|
+
highPartitionIndex = lowPartitionIndex;
|
745
|
+
lowPartitionIndex--;
|
746
|
+
|
747
|
+
// move the partition key to end of low partition
|
748
|
+
tempSeq = seq[midIndex];
|
749
|
+
tempKey = key[midIndex];
|
750
|
+
seq[midIndex] = seq[lowPartitionIndex];
|
751
|
+
key[midIndex] = key[lowPartitionIndex];
|
752
|
+
seq[lowPartitionIndex] = tempSeq;
|
753
|
+
key[lowPartitionIndex] = tempKey;
|
754
|
+
|
755
|
+
if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
|
756
|
+
|
757
|
+
// Many keys = partition key; separate the equal key data from the lower partition
|
758
|
+
|
759
|
+
midIndex = lowIndex;
|
760
|
+
|
761
|
+
for (;;) {
|
762
|
+
while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
|
763
|
+
midIndex++;
|
764
|
+
}
|
765
|
+
while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
|
766
|
+
lowPartitionIndex--;
|
767
|
+
}
|
768
|
+
if (midIndex >= lowPartitionIndex) {
|
769
|
+
break;
|
770
|
+
}
|
771
|
+
tempSeq = seq[midIndex];
|
772
|
+
tempKey = key[midIndex];
|
773
|
+
seq[midIndex] = seq[lowPartitionIndex - 1];
|
774
|
+
key[midIndex] = key[lowPartitionIndex - 1];
|
775
|
+
seq[lowPartitionIndex - 1] = tempSeq;
|
776
|
+
key[lowPartitionIndex - 1] = tempKey;
|
777
|
+
midIndex++;
|
778
|
+
lowPartitionIndex--;
|
779
|
+
}
|
780
|
+
|
781
|
+
}
|
782
|
+
|
783
|
+
if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
|
784
|
+
// put the larger partition to stack
|
785
|
+
lowStack[stackDepth] = lowIndex;
|
786
|
+
highStack[stackDepth] = lowPartitionIndex - 1;
|
787
|
+
stackDepth++;
|
788
|
+
// sort the smaller partition first
|
789
|
+
lowIndex = highPartitionIndex;
|
790
|
+
} else {
|
791
|
+
// put the larger partition to stack
|
792
|
+
lowStack[stackDepth] = highPartitionIndex;
|
793
|
+
highStack[stackDepth] = highIndex;
|
794
|
+
stackDepth++;
|
795
|
+
// sort the smaller partition first
|
796
|
+
if (lowPartitionIndex > lowIndex) {
|
797
|
+
highIndex = lowPartitionIndex - 1;
|
798
|
+
} else {
|
799
|
+
// all keys in the partition equals to the partition key
|
800
|
+
break;
|
801
|
+
}
|
802
|
+
}
|
803
|
+
continue;
|
804
|
+
}
|
805
|
+
|
806
|
+
// Pop a range from stack
|
807
|
+
if (stackDepth > 0) {
|
808
|
+
stackDepth--;
|
809
|
+
lowIndex = lowStack[stackDepth];
|
810
|
+
highIndex = highStack[stackDepth];
|
811
|
+
continue;
|
812
|
+
} else return;
|
813
|
+
}
|
814
|
+
}
|
815
|
+
|
816
|
+
|
817
|
+
static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
|
818
|
+
unsigned int* __restrict relativeRank, const unsigned int numItem,
|
819
|
+
unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
|
820
|
+
{
|
821
|
+
unsigned int i, c;
|
822
|
+
unsigned int s, r;
|
823
|
+
unsigned int lastRank, lastIndex;
|
824
|
+
unsigned int oldInverseSa0RelativeRank = 0;
|
825
|
+
unsigned int freq;
|
826
|
+
|
827
|
+
lastIndex = numItem;
|
828
|
+
lastRank = sortedRank[numItem];
|
829
|
+
if (lastRank > oldInverseSa0) {
|
830
|
+
sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt
|
831
|
+
}
|
832
|
+
s = seq[numItem];
|
833
|
+
relativeRank[s] = numItem;
|
834
|
+
if (lastRank == oldInverseSa0) {
|
835
|
+
oldInverseSa0RelativeRank = numItem;
|
836
|
+
oldInverseSa0++; // so that this segment of code is not run again
|
837
|
+
lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
|
838
|
+
}
|
839
|
+
|
840
|
+
c = ALPHABET_SIZE - 1;
|
841
|
+
freq = cumulativeCount[c];
|
842
|
+
|
843
|
+
for (i=numItem; i--;) { // from numItem - 1 to 0
|
844
|
+
r = sortedRank[i];
|
845
|
+
if (r > oldInverseSa0) {
|
846
|
+
sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt
|
847
|
+
}
|
848
|
+
s = seq[i];
|
849
|
+
if (i < freq) {
|
850
|
+
if (lastIndex >= freq) {
|
851
|
+
lastRank++; // to trigger the group across alphabet boundary to be split
|
852
|
+
}
|
853
|
+
c--;
|
854
|
+
freq = cumulativeCount[c];
|
855
|
+
}
|
856
|
+
if (r == lastRank) {
|
857
|
+
relativeRank[s] = lastIndex;
|
858
|
+
} else {
|
859
|
+
if (i == lastIndex - 1) {
|
860
|
+
if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
|
861
|
+
seq[lastIndex] = seq[lastIndex + 1] - 1;
|
862
|
+
} else {
|
863
|
+
seq[lastIndex] = (unsigned int)-1;
|
864
|
+
}
|
865
|
+
}
|
866
|
+
lastIndex = i;
|
867
|
+
lastRank = r;
|
868
|
+
relativeRank[s] = i;
|
869
|
+
if (r == oldInverseSa0) {
|
870
|
+
oldInverseSa0RelativeRank = i;
|
871
|
+
oldInverseSa0++; // so that this segment of code is not run again
|
872
|
+
lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
|
873
|
+
}
|
874
|
+
}
|
875
|
+
}
|
876
|
+
|
877
|
+
}
|
878
|
+
|
879
|
+
static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar,
|
880
|
+
const unsigned int *cumulativeCount)
|
881
|
+
{
|
882
|
+
unsigned int i, c;
|
883
|
+
unsigned int previousRank, currentRank;
|
884
|
+
|
885
|
+
previousRank = relativeRank[0];
|
886
|
+
|
887
|
+
for (i=1; i<=numChar; i++) {
|
888
|
+
currentRank = relativeRank[i];
|
889
|
+
c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
|
890
|
+
+ (previousRank >= cumulativeCount[3]);
|
891
|
+
seq[currentRank] = c;
|
892
|
+
previousRank = currentRank;
|
893
|
+
}
|
894
|
+
}
|
895
|
+
|
896
|
+
static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
|
897
|
+
unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
|
898
|
+
{
|
899
|
+
unsigned int bitsInWordMinusBitPerChar;
|
900
|
+
unsigned int leftShift, rightShift;
|
901
|
+
unsigned int o;
|
902
|
+
unsigned int oIndex, iIndex, mIndex;
|
903
|
+
unsigned int mWord, mChar, oWord, oChar;
|
904
|
+
unsigned int numInsert;
|
905
|
+
|
906
|
+
bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
|
907
|
+
|
908
|
+
oIndex = 0;
|
909
|
+
iIndex = 0;
|
910
|
+
mIndex = 0;
|
911
|
+
|
912
|
+
mWord = 0;
|
913
|
+
mChar = 0;
|
914
|
+
|
915
|
+
mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration
|
916
|
+
|
917
|
+
while (oIndex < numOldBwt) {
|
918
|
+
|
919
|
+
// copy from insertBwt
|
920
|
+
while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
|
921
|
+
if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0
|
922
|
+
mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
|
923
|
+
mIndex++;
|
924
|
+
mChar++;
|
925
|
+
if (mChar == CHAR_PER_WORD) {
|
926
|
+
mChar = 0;
|
927
|
+
mWord++;
|
928
|
+
mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
|
929
|
+
}
|
930
|
+
}
|
931
|
+
iIndex++;
|
932
|
+
}
|
933
|
+
|
934
|
+
// Copy from oldBwt to mergedBwt
|
935
|
+
if (iIndex <= numInsertBwt) {
|
936
|
+
o = sortedRank[iIndex];
|
937
|
+
} else {
|
938
|
+
o = numOldBwt;
|
939
|
+
}
|
940
|
+
numInsert = o - oIndex;
|
941
|
+
|
942
|
+
oWord = oIndex / CHAR_PER_WORD;
|
943
|
+
oChar = oIndex - oWord * CHAR_PER_WORD;
|
944
|
+
if (oChar > mChar) {
|
945
|
+
leftShift = (oChar - mChar) * BIT_PER_CHAR;
|
946
|
+
rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
|
947
|
+
mergedBwt[mWord] = mergedBwt[mWord]
|
948
|
+
| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
|
949
|
+
| (oldBwt[oWord+1] >> rightShift);
|
950
|
+
oIndex += min(numInsert, CHAR_PER_WORD - mChar);
|
951
|
+
while (o > oIndex) {
|
952
|
+
oWord++;
|
953
|
+
mWord++;
|
954
|
+
mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
|
955
|
+
oIndex += CHAR_PER_WORD;
|
956
|
+
}
|
957
|
+
} else if (oChar < mChar) {
|
958
|
+
rightShift = (mChar - oChar) * BIT_PER_CHAR;
|
959
|
+
leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
|
960
|
+
mergedBwt[mWord] = mergedBwt[mWord]
|
961
|
+
| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
|
962
|
+
oIndex += min(numInsert, CHAR_PER_WORD - mChar);
|
963
|
+
while (o > oIndex) {
|
964
|
+
oWord++;
|
965
|
+
mWord++;
|
966
|
+
mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
|
967
|
+
oIndex += CHAR_PER_WORD;
|
968
|
+
}
|
969
|
+
} else { // oChar == mChar
|
970
|
+
mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
|
971
|
+
oIndex += min(numInsert, CHAR_PER_WORD - mChar);
|
972
|
+
while (o > oIndex) {
|
973
|
+
oWord++;
|
974
|
+
mWord++;
|
975
|
+
mergedBwt[mWord] = oldBwt[oWord];
|
976
|
+
oIndex += CHAR_PER_WORD;
|
977
|
+
}
|
978
|
+
}
|
979
|
+
oIndex = o;
|
980
|
+
mIndex += numInsert;
|
981
|
+
|
982
|
+
// Clear the trailing garbage in mergedBwt
|
983
|
+
mWord = mIndex / CHAR_PER_WORD;
|
984
|
+
mChar = mIndex - mWord * CHAR_PER_WORD;
|
985
|
+
if (mChar == 0) {
|
986
|
+
mergedBwt[mWord] = 0;
|
987
|
+
} else {
|
988
|
+
mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
|
989
|
+
}
|
990
|
+
|
991
|
+
}
|
992
|
+
|
993
|
+
// copy from insertBwt
|
994
|
+
while (iIndex <= numInsertBwt) {
|
995
|
+
if (sortedRank[iIndex] != 0) {
|
996
|
+
mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
|
997
|
+
mIndex++;
|
998
|
+
mChar++;
|
999
|
+
if (mChar == CHAR_PER_WORD) {
|
1000
|
+
mChar = 0;
|
1001
|
+
mWord++;
|
1002
|
+
mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
|
1003
|
+
}
|
1004
|
+
}
|
1005
|
+
iIndex++;
|
1006
|
+
}
|
1007
|
+
}
|
1008
|
+
|
1009
|
+
void BWTClearTrailingBwtCode(BWT *bwt)
|
1010
|
+
{
|
1011
|
+
unsigned int bwtResidentSizeInWord;
|
1012
|
+
unsigned int wordIndex, offset;
|
1013
|
+
unsigned int i;
|
1014
|
+
|
1015
|
+
bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
|
1016
|
+
|
1017
|
+
wordIndex = bwt->textLength / CHAR_PER_WORD;
|
1018
|
+
offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
|
1019
|
+
if (offset > 0) {
|
1020
|
+
bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
|
1021
|
+
} else {
|
1022
|
+
if (wordIndex < bwtResidentSizeInWord) {
|
1023
|
+
bwt->bwtCode[wordIndex] = 0;
|
1024
|
+
}
|
1025
|
+
}
|
1026
|
+
|
1027
|
+
for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
|
1028
|
+
bwt->bwtCode[i] = 0;
|
1029
|
+
}
|
1030
|
+
}
|
1031
|
+
|
1032
|
+
|
1033
|
+
void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue,
|
1034
|
+
unsigned int* __restrict occValueMajor,
|
1035
|
+
const unsigned int textLength, const unsigned int* decodeTable)
|
1036
|
+
{
|
1037
|
+
unsigned int numberOfOccValueMajor, numberOfOccValue;
|
1038
|
+
unsigned int wordBetweenOccValue;
|
1039
|
+
unsigned int numberOfOccIntervalPerMajor;
|
1040
|
+
unsigned int c;
|
1041
|
+
unsigned int i, j;
|
1042
|
+
unsigned int occMajorIndex;
|
1043
|
+
unsigned int occIndex, bwtIndex;
|
1044
|
+
unsigned int sum;
|
1045
|
+
unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
|
1046
|
+
|
1047
|
+
wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
|
1048
|
+
|
1049
|
+
// Calculate occValue
|
1050
|
+
// [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256
|
1051
|
+
numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
1052
|
+
numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
|
1053
|
+
numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
|
1054
|
+
|
1055
|
+
tempOccValue0[0] = 0;
|
1056
|
+
tempOccValue0[1] = 0;
|
1057
|
+
tempOccValue0[2] = 0;
|
1058
|
+
tempOccValue0[3] = 0;
|
1059
|
+
occValueMajor[0] = 0;
|
1060
|
+
occValueMajor[1] = 0;
|
1061
|
+
occValueMajor[2] = 0;
|
1062
|
+
occValueMajor[3] = 0;
|
1063
|
+
|
1064
|
+
occIndex = 0;
|
1065
|
+
bwtIndex = 0;
|
1066
|
+
for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
|
1067
|
+
|
1068
|
+
for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
|
1069
|
+
|
1070
|
+
sum = 0;
|
1071
|
+
tempOccValue1[0] = tempOccValue0[0];
|
1072
|
+
tempOccValue1[1] = tempOccValue0[1];
|
1073
|
+
tempOccValue1[2] = tempOccValue0[2];
|
1074
|
+
tempOccValue1[3] = tempOccValue0[3];
|
1075
|
+
|
1076
|
+
for (j=0; j<wordBetweenOccValue; j++) {
|
1077
|
+
c = bwt[bwtIndex];
|
1078
|
+
sum += decodeTable[c >> 16];
|
1079
|
+
sum += decodeTable[c & 0x0000FFFF];
|
1080
|
+
bwtIndex++;
|
1081
|
+
}
|
1082
|
+
if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
1083
|
+
tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
|
1084
|
+
tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
|
1085
|
+
tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
|
1086
|
+
tempOccValue1[3] += sum;
|
1087
|
+
} else {
|
1088
|
+
if (sum == 0x00000100) {
|
1089
|
+
tempOccValue1[0] += 256;
|
1090
|
+
} else if (sum == 0x00010000) {
|
1091
|
+
tempOccValue1[1] += 256;
|
1092
|
+
} else if (sum == 0x01000000) {
|
1093
|
+
tempOccValue1[2] += 256;
|
1094
|
+
} else {
|
1095
|
+
tempOccValue1[3] += 256;
|
1096
|
+
}
|
1097
|
+
}
|
1098
|
+
occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
|
1099
|
+
occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
|
1100
|
+
occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
|
1101
|
+
occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
|
1102
|
+
tempOccValue0[0] = tempOccValue1[0];
|
1103
|
+
tempOccValue0[1] = tempOccValue1[1];
|
1104
|
+
tempOccValue0[2] = tempOccValue1[2];
|
1105
|
+
tempOccValue0[3] = tempOccValue1[3];
|
1106
|
+
sum = 0;
|
1107
|
+
|
1108
|
+
occIndex++;
|
1109
|
+
|
1110
|
+
for (j=0; j<wordBetweenOccValue; j++) {
|
1111
|
+
c = bwt[bwtIndex];
|
1112
|
+
sum += decodeTable[c >> 16];
|
1113
|
+
sum += decodeTable[c & 0x0000FFFF];
|
1114
|
+
bwtIndex++;
|
1115
|
+
}
|
1116
|
+
if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
1117
|
+
tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
|
1118
|
+
tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
|
1119
|
+
tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
|
1120
|
+
tempOccValue0[3] += sum;
|
1121
|
+
} else {
|
1122
|
+
if (sum == 0x00000100) {
|
1123
|
+
tempOccValue0[0] += 256;
|
1124
|
+
} else if (sum == 0x00010000) {
|
1125
|
+
tempOccValue0[1] += 256;
|
1126
|
+
} else if (sum == 0x01000000) {
|
1127
|
+
tempOccValue0[2] += 256;
|
1128
|
+
} else {
|
1129
|
+
tempOccValue0[3] += 256;
|
1130
|
+
}
|
1131
|
+
}
|
1132
|
+
}
|
1133
|
+
|
1134
|
+
occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
|
1135
|
+
occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
|
1136
|
+
occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
|
1137
|
+
occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
|
1138
|
+
tempOccValue0[0] = 0;
|
1139
|
+
tempOccValue0[1] = 0;
|
1140
|
+
tempOccValue0[2] = 0;
|
1141
|
+
tempOccValue0[3] = 0;
|
1142
|
+
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
while (occIndex < (numberOfOccValue-1)/2) {
|
1146
|
+
sum = 0;
|
1147
|
+
tempOccValue1[0] = tempOccValue0[0];
|
1148
|
+
tempOccValue1[1] = tempOccValue0[1];
|
1149
|
+
tempOccValue1[2] = tempOccValue0[2];
|
1150
|
+
tempOccValue1[3] = tempOccValue0[3];
|
1151
|
+
for (j=0; j<wordBetweenOccValue; j++) {
|
1152
|
+
c = bwt[bwtIndex];
|
1153
|
+
sum += decodeTable[c >> 16];
|
1154
|
+
sum += decodeTable[c & 0x0000FFFF];
|
1155
|
+
bwtIndex++;
|
1156
|
+
}
|
1157
|
+
if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
1158
|
+
tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
|
1159
|
+
tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
|
1160
|
+
tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
|
1161
|
+
tempOccValue1[3] += sum;
|
1162
|
+
} else {
|
1163
|
+
if (sum == 0x00000100) {
|
1164
|
+
tempOccValue1[0] += 256;
|
1165
|
+
} else if (sum == 0x00010000) {
|
1166
|
+
tempOccValue1[1] += 256;
|
1167
|
+
} else if (sum == 0x01000000) {
|
1168
|
+
tempOccValue1[2] += 256;
|
1169
|
+
} else {
|
1170
|
+
tempOccValue1[3] += 256;
|
1171
|
+
}
|
1172
|
+
}
|
1173
|
+
occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
|
1174
|
+
occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
|
1175
|
+
occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
|
1176
|
+
occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
|
1177
|
+
tempOccValue0[0] = tempOccValue1[0];
|
1178
|
+
tempOccValue0[1] = tempOccValue1[1];
|
1179
|
+
tempOccValue0[2] = tempOccValue1[2];
|
1180
|
+
tempOccValue0[3] = tempOccValue1[3];
|
1181
|
+
sum = 0;
|
1182
|
+
occIndex++;
|
1183
|
+
|
1184
|
+
for (j=0; j<wordBetweenOccValue; j++) {
|
1185
|
+
c = bwt[bwtIndex];
|
1186
|
+
sum += decodeTable[c >> 16];
|
1187
|
+
sum += decodeTable[c & 0x0000FFFF];
|
1188
|
+
bwtIndex++;
|
1189
|
+
}
|
1190
|
+
if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
1191
|
+
tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
|
1192
|
+
tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
|
1193
|
+
tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
|
1194
|
+
tempOccValue0[3] += sum;
|
1195
|
+
} else {
|
1196
|
+
if (sum == 0x00000100) {
|
1197
|
+
tempOccValue0[0] += 256;
|
1198
|
+
} else if (sum == 0x00010000) {
|
1199
|
+
tempOccValue0[1] += 256;
|
1200
|
+
} else if (sum == 0x01000000) {
|
1201
|
+
tempOccValue0[2] += 256;
|
1202
|
+
} else {
|
1203
|
+
tempOccValue0[3] += 256;
|
1204
|
+
}
|
1205
|
+
}
|
1206
|
+
}
|
1207
|
+
|
1208
|
+
sum = 0;
|
1209
|
+
tempOccValue1[0] = tempOccValue0[0];
|
1210
|
+
tempOccValue1[1] = tempOccValue0[1];
|
1211
|
+
tempOccValue1[2] = tempOccValue0[2];
|
1212
|
+
tempOccValue1[3] = tempOccValue0[3];
|
1213
|
+
|
1214
|
+
if (occIndex * 2 < numberOfOccValue - 1) {
|
1215
|
+
for (j=0; j<wordBetweenOccValue; j++) {
|
1216
|
+
c = bwt[bwtIndex];
|
1217
|
+
sum += decodeTable[c >> 16];
|
1218
|
+
sum += decodeTable[c & 0x0000FFFF];
|
1219
|
+
bwtIndex++;
|
1220
|
+
}
|
1221
|
+
if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
1222
|
+
tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
|
1223
|
+
tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
|
1224
|
+
tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
|
1225
|
+
tempOccValue1[3] += sum;
|
1226
|
+
} else {
|
1227
|
+
if (sum == 0x00000100) {
|
1228
|
+
tempOccValue1[0] += 256;
|
1229
|
+
} else if (sum == 0x00010000) {
|
1230
|
+
tempOccValue1[1] += 256;
|
1231
|
+
} else if (sum == 0x01000000) {
|
1232
|
+
tempOccValue1[2] += 256;
|
1233
|
+
} else {
|
1234
|
+
tempOccValue1[3] += 256;
|
1235
|
+
}
|
1236
|
+
}
|
1237
|
+
}
|
1238
|
+
|
1239
|
+
occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
|
1240
|
+
occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
|
1241
|
+
occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
|
1242
|
+
occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
|
1243
|
+
|
1244
|
+
}
|
1245
|
+
|
1246
|
+
static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
1247
|
+
{
|
1248
|
+
unsigned int i;
|
1249
|
+
unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
|
1250
|
+
unsigned int firstCharInThisIteration;
|
1251
|
+
|
1252
|
+
unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
|
1253
|
+
unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
|
1254
|
+
|
1255
|
+
#ifdef DEBUG
|
1256
|
+
if (numChar > bwtInc->buildSize) {
|
1257
|
+
fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
|
1258
|
+
exit(1);
|
1259
|
+
}
|
1260
|
+
#endif
|
1261
|
+
|
1262
|
+
mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
|
1263
|
+
mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
|
1264
|
+
|
1265
|
+
initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
1266
|
+
|
1267
|
+
if (bwtInc->bwt->textLength == 0) { // Initial build
|
1268
|
+
|
1269
|
+
// Set address
|
1270
|
+
seq = bwtInc->workingMemory;
|
1271
|
+
relativeRank = seq + bwtInc->buildSize + 1;
|
1272
|
+
mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place
|
1273
|
+
|
1274
|
+
BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
|
1275
|
+
|
1276
|
+
firstCharInThisIteration = relativeRank[0];
|
1277
|
+
relativeRank[numChar] = 0;
|
1278
|
+
|
1279
|
+
// Sort suffix
|
1280
|
+
QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
|
1281
|
+
newInverseSa0 = relativeRank[0];
|
1282
|
+
|
1283
|
+
// Clear BWT area
|
1284
|
+
initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
|
1285
|
+
|
1286
|
+
// Build BWT
|
1287
|
+
BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
|
1288
|
+
|
1289
|
+
// so that the cumulativeCount is not deducted
|
1290
|
+
bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
|
1291
|
+
|
1292
|
+
} else { // Incremental build
|
1293
|
+
// Set address
|
1294
|
+
sortedRank = bwtInc->workingMemory;
|
1295
|
+
seq = sortedRank + bwtInc->buildSize + 1;
|
1296
|
+
insertBwt = seq;
|
1297
|
+
relativeRank = seq + bwtInc->buildSize + 1;
|
1298
|
+
|
1299
|
+
// Store the first character of this iteration
|
1300
|
+
firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
|
1301
|
+
|
1302
|
+
// Count occurrence of input text
|
1303
|
+
ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
|
1304
|
+
// Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
|
1305
|
+
bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
|
1306
|
+
bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
|
1307
|
+
bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
|
1308
|
+
bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
|
1309
|
+
|
1310
|
+
// Get rank of new suffix among processed suffix
|
1311
|
+
// The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
|
1312
|
+
oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText,
|
1313
|
+
numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
|
1314
|
+
|
1315
|
+
// Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
|
1316
|
+
for (i=0; i<ALPHABET_SIZE; i++) {
|
1317
|
+
if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
|
1318
|
+
bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
|
1319
|
+
BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
|
1320
|
+
} else {
|
1321
|
+
if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
|
1322
|
+
BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
|
1323
|
+
}
|
1324
|
+
if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
|
1325
|
+
BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
|
1326
|
+
}
|
1327
|
+
}
|
1328
|
+
}
|
1329
|
+
|
1330
|
+
// build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
|
1331
|
+
// the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
|
1332
|
+
BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
|
1333
|
+
#ifdef DEBUG
|
1334
|
+
if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
|
1335
|
+
fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
|
1336
|
+
exit(1);
|
1337
|
+
}
|
1338
|
+
#endif
|
1339
|
+
|
1340
|
+
// Sort suffix
|
1341
|
+
QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);
|
1342
|
+
|
1343
|
+
newInverseSa0RelativeRank = relativeRank[0];
|
1344
|
+
newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
|
1345
|
+
|
1346
|
+
sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt
|
1347
|
+
|
1348
|
+
// Build BWT
|
1349
|
+
BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
|
1350
|
+
|
1351
|
+
// Merge BWT
|
1352
|
+
mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord
|
1353
|
+
- bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR;
|
1354
|
+
// minus numberOfIteration * occInterval to create a buffer for merging
|
1355
|
+
BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
|
1356
|
+
|
1357
|
+
}
|
1358
|
+
|
1359
|
+
// Build auxiliary structure and update info and pointers in BWT
|
1360
|
+
bwtInc->bwt->textLength += numChar;
|
1361
|
+
bwtInc->bwt->bwtCode = mergedBwt;
|
1362
|
+
bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
|
1363
|
+
bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
|
1364
|
+
if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
|
1365
|
+
fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
|
1366
|
+
exit(1);
|
1367
|
+
}
|
1368
|
+
|
1369
|
+
bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
|
1370
|
+
|
1371
|
+
BWTClearTrailingBwtCode(bwtInc->bwt);
|
1372
|
+
BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
|
1373
|
+
bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
|
1374
|
+
|
1375
|
+
bwtInc->bwt->inverseSa0 = newInverseSa0;
|
1376
|
+
|
1377
|
+
bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
|
1378
|
+
bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
|
1379
|
+
bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
|
1380
|
+
bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
|
1381
|
+
|
1382
|
+
bwtInc->firstCharInLastIteration = firstCharInThisIteration;
|
1383
|
+
|
1384
|
+
// Set build size and text address for the next build
|
1385
|
+
BWTIncSetBuildSizeAndTextAddr(bwtInc);
|
1386
|
+
bwtInc->numberOfIterationDone++;
|
1387
|
+
|
1388
|
+
}
|
1389
|
+
|
1390
|
+
BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
|
1391
|
+
const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
|
1392
|
+
{
|
1393
|
+
|
1394
|
+
FILE *packedFile;
|
1395
|
+
unsigned int packedFileLen;
|
1396
|
+
unsigned int totalTextLength;
|
1397
|
+
unsigned int textToLoad, textSizeInByte;
|
1398
|
+
unsigned int processedTextLength;
|
1399
|
+
unsigned char lastByteLength;
|
1400
|
+
|
1401
|
+
BWTInc *bwtInc;
|
1402
|
+
|
1403
|
+
packedFile = (FILE*)fopen(inputFileName, "rb");
|
1404
|
+
|
1405
|
+
if (packedFile == NULL) {
|
1406
|
+
fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n");
|
1407
|
+
exit(1);
|
1408
|
+
}
|
1409
|
+
|
1410
|
+
fseek(packedFile, -1, SEEK_END);
|
1411
|
+
packedFileLen = ftell(packedFile);
|
1412
|
+
if ((int)packedFileLen < 0) {
|
1413
|
+
fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
|
1414
|
+
exit(1);
|
1415
|
+
}
|
1416
|
+
fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
|
1417
|
+
totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
|
1418
|
+
|
1419
|
+
bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize);
|
1420
|
+
|
1421
|
+
BWTIncSetBuildSizeAndTextAddr(bwtInc);
|
1422
|
+
|
1423
|
+
if (bwtInc->buildSize > totalTextLength) {
|
1424
|
+
textToLoad = totalTextLength;
|
1425
|
+
} else {
|
1426
|
+
textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
|
1427
|
+
}
|
1428
|
+
textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte
|
1429
|
+
|
1430
|
+
fseek(packedFile, -2, SEEK_CUR);
|
1431
|
+
fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
1432
|
+
fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
|
1433
|
+
fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);
|
1434
|
+
|
1435
|
+
ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
|
1436
|
+
BWTIncConstruct(bwtInc, textToLoad);
|
1437
|
+
|
1438
|
+
processedTextLength = textToLoad;
|
1439
|
+
|
1440
|
+
while (processedTextLength < totalTextLength) {
|
1441
|
+
textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
|
1442
|
+
if (textToLoad > totalTextLength - processedTextLength) {
|
1443
|
+
textToLoad = totalTextLength - processedTextLength;
|
1444
|
+
}
|
1445
|
+
textSizeInByte = textToLoad / CHAR_PER_BYTE;
|
1446
|
+
fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
1447
|
+
fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
|
1448
|
+
fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
1449
|
+
ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
|
1450
|
+
BWTIncConstruct(bwtInc, textToLoad);
|
1451
|
+
processedTextLength += textToLoad;
|
1452
|
+
if (bwtInc->numberOfIterationDone % 10 == 0) {
|
1453
|
+
printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
|
1454
|
+
bwtInc->numberOfIterationDone, processedTextLength);
|
1455
|
+
}
|
1456
|
+
}
|
1457
|
+
return bwtInc;
|
1458
|
+
}
|
1459
|
+
|
1460
|
+
void BWTFree(BWT *bwt)
|
1461
|
+
{
|
1462
|
+
if (bwt == 0) return;
|
1463
|
+
free(bwt->cumulativeFreq);
|
1464
|
+
free(bwt->bwtCode);
|
1465
|
+
free(bwt->occValue);
|
1466
|
+
free(bwt->occValueMajor);
|
1467
|
+
free(bwt->saValue);
|
1468
|
+
free(bwt->inverseSa);
|
1469
|
+
free(bwt->decodeTable);
|
1470
|
+
free(bwt->saIndexRange);
|
1471
|
+
free(bwt->saValueOnBoundary);
|
1472
|
+
free(bwt);
|
1473
|
+
}
|
1474
|
+
|
1475
|
+
void BWTIncFree(BWTInc *bwtInc)
|
1476
|
+
{
|
1477
|
+
if (bwtInc == 0) return;
|
1478
|
+
free(bwtInc->bwt);
|
1479
|
+
free(bwtInc->workingMemory);
|
1480
|
+
free(bwtInc);
|
1481
|
+
}
|
1482
|
+
|
1483
|
+
static unsigned int BWTFileSizeInWord(const unsigned int numChar)
|
1484
|
+
{
|
1485
|
+
// The $ in BWT at the position of inverseSa0 is not encoded
|
1486
|
+
return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
|
1487
|
+
}
|
1488
|
+
|
1489
|
+
void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
|
1490
|
+
{
|
1491
|
+
FILE *bwtFile;
|
1492
|
+
/* FILE *occValueFile; */
|
1493
|
+
unsigned int bwtLength;
|
1494
|
+
|
1495
|
+
bwtFile = (FILE*)fopen(bwtFileName, "wb");
|
1496
|
+
if (bwtFile == NULL) {
|
1497
|
+
fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n");
|
1498
|
+
exit(1);
|
1499
|
+
}
|
1500
|
+
|
1501
|
+
fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
|
1502
|
+
fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
|
1503
|
+
bwtLength = BWTFileSizeInWord(bwt->textLength);
|
1504
|
+
fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
|
1505
|
+
fclose(bwtFile);
|
1506
|
+
/*
|
1507
|
+
occValueFile = (FILE*)fopen(occValueFileName, "wb");
|
1508
|
+
if (occValueFile == NULL) {
|
1509
|
+
fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n");
|
1510
|
+
exit(1);
|
1511
|
+
}
|
1512
|
+
|
1513
|
+
fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile);
|
1514
|
+
fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile);
|
1515
|
+
fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile);
|
1516
|
+
fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile);
|
1517
|
+
fclose(occValueFile);
|
1518
|
+
*/
|
1519
|
+
}
|
1520
|
+
|
1521
|
+
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
|
1522
|
+
{
|
1523
|
+
BWTInc *bwtInc;
|
1524
|
+
bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
|
1525
|
+
printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
|
1526
|
+
BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
|
1527
|
+
BWTIncFree(bwtInc);
|
1528
|
+
}
|
1529
|
+
|
1530
|
+
int bwt_bwtgen_main(int argc, char *argv[])
|
1531
|
+
{
|
1532
|
+
if (argc < 3) {
|
1533
|
+
fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
|
1534
|
+
return 1;
|
1535
|
+
}
|
1536
|
+
bwt_bwtgen(argv[1], argv[2]);
|
1537
|
+
return 0;
|
1538
|
+
}
|
1539
|
+
|
1540
|
+
#ifdef MAIN_BWT_GEN
|
1541
|
+
|
1542
|
+
int main(int argc, char *argv[])
|
1543
|
+
{
|
1544
|
+
return bwt_bwtgen_main(argc, argv);
|
1545
|
+
}
|
1546
|
+
|
1547
|
+
#endif
|