bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwt.h
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#ifndef BWA_BWT_H
|
29
|
+
#define BWA_BWT_H
|
30
|
+
|
31
|
+
#include <stdint.h>
|
32
|
+
|
33
|
+
// Number of BWT symbols per occurrence-checkpoint interval.
// requirement: (OCC_INTERVAL%16 == 0)
#define OCC_INTERVAL 0x80

// Enforce the requirement above at compile time instead of relying on the
// comment alone: the accessor macros divide symbol offsets by 16.
#if (OCC_INTERVAL % 16) != 0
#error "OCC_INTERVAL must be a multiple of 16"
#endif
|
35
|
+
|
36
|
+
/* Byte type; guarded by BWA_UBYTE so that another header guarded by the
 * same macro can provide the identical typedef without a clash. */
#ifndef BWA_UBYTE
#define BWA_UBYTE
typedef unsigned char ubyte_t;
#endif

/* Integer type used for BWT coordinates and counts (32-bit). */
typedef uint32_t bwtint_t;

/* In-memory BWT index. */
typedef struct {
	bwtint_t primary;        // S^{-1}(0), or the primary index of BWT
	bwtint_t L2[5];          // C(), cumulative count
	bwtint_t seq_len;        // sequence length
	bwtint_t bwt_size;       // size of bwt, about seq_len/4
	uint32_t *bwt;           // BWT
	// occurrence array, separated to two parts
	uint32_t cnt_table[256]; // NOTE(review): presumably filled by bwt_gen_cnt_table() -- confirm
	// suffix array
	int sa_intv;             // NOTE(review): presumably the sampling interval passed to bwt_cal_sa() -- confirm
	bwtint_t n_sa;           // NOTE(review): presumably the number of entries in sa -- confirm
	bwtint_t *sa;            // suffix array values
} bwt_t;
|
55
|
+
|
56
|
+
/* The 32-bit word of b->bwt that holds symbol k. The array is addressed in
 * groups of 12 words per OCC_INTERVAL symbols; the first 4 words of each
 * group are skipped and every following word carries 16 two-bit symbols. */
#define bwt_bwt(b, k) \
	((b)->bwt[((k) / OCC_INTERVAL) * 12 + 4 + ((k) % OCC_INTERVAL) / 16])

/* retrieve a character from the $-removed BWT string. Note that
 * bwt_t::bwt is not exactly the BWT string and therefore this macro is
 * called bwt_B0 instead of bwt_B.
 * Symbol k sits at bit offset 2 * (15 - k % 16) of its word, i.e. the first
 * symbol of a word occupies the two most significant bits. */
#define bwt_B0(b, k) \
	((bwt_bwt(b, k) >> ((~(k) & 0xf) << 1)) & 3)

/* Pointer to the first word of the 12-word group that contains symbol k. */
#define bwt_occ_intv(b, k) \
	((b)->bwt + ((k) / OCC_INTERVAL) * 12)

/* inverse Psi function.
 * Yields 0 at the primary index; below it the symbol is read at k, above it
 * at k-1 (the stored string has the $ removed). The argument k is evaluated
 * several times, so it must be free of side effects, and bwt_occ() must be
 * declared wherever the macro is expanded. */
#define bwt_invPsi(bwt, k) \
	(((k) == (bwt)->primary) ? 0 : \
	 (((k) < (bwt)->primary) ? \
	  ((bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k))) : \
	  ((bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))))
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
extern "C" {
|
74
|
+
#endif
|
75
|
+
|
76
|
+
void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
|
77
|
+
void bwt_dump_sa(const char *fn, const bwt_t *bwt);
|
78
|
+
|
79
|
+
bwt_t *bwt_restore_bwt(const char *fn);
|
80
|
+
void bwt_restore_sa(const char *fn, bwt_t *bwt);
|
81
|
+
|
82
|
+
void bwt_destroy(bwt_t *bwt);
|
83
|
+
|
84
|
+
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
|
85
|
+
void bwt_cal_sa(bwt_t *bwt, int intv);
|
86
|
+
|
87
|
+
void bwt_bwtupdate_core(bwt_t *bwt);
|
88
|
+
|
89
|
+
inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
|
90
|
+
inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
|
91
|
+
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
|
92
|
+
|
93
|
+
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
|
94
|
+
void bwt_gen_cnt_table(bwt_t *bwt);
|
95
|
+
inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
|
96
|
+
inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
|
97
|
+
|
98
|
+
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
|
99
|
+
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
|
100
|
+
|
101
|
+
#ifdef __cplusplus
|
102
|
+
}
|
103
|
+
#endif
|
104
|
+
|
105
|
+
#endif
|
@@ -0,0 +1,23 @@
|
|
1
|
+
CC=			gcc
CFLAGS=		-g -Wall -O2 -m64 # comment out `-m64' for 32-bit compilation
DFLAGS=		-D_FILE_OFFSET_BITS=64
OBJS=		bwt_gen.o QSufSort.o
INCLUDES=
VERSION=	0.1.0
LIBS=
SUBDIRS=

# lib, clean and cleanlocal never create a file of the same name; declaring
# them phony keeps them working even if such a file appears in this directory.
.PHONY: lib clean cleanlocal

.SUFFIXES:.c .o

.c.o:
		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@

# Default goal: the static library holding the BWT generation objects.
lib:libbwtgen.a

libbwtgen.a:$(OBJS)
		$(AR) -cru $@ $(OBJS)

# PROG is not set in this Makefile; it expands to nothing unless supplied
# on the command line or by the environment.
cleanlocal:
		rm -f gmon.out *.o a.out $(PROG) *~ *.a

clean:cleanlocal
|
@@ -0,0 +1,496 @@
|
|
1
|
+
/* QSufSort.c
|
2
|
+
|
3
|
+
Original source from qsufsort.c
|
4
|
+
|
5
|
+
Copyright 1999, N. Jesper Larsson, all rights reserved.
|
6
|
+
|
7
|
+
This file contains an implementation of the algorithm presented in "Faster
|
8
|
+
Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
|
9
|
+
Sadakane (sada@is.s.u-tokyo.ac.jp).
|
10
|
+
|
11
|
+
This software may be used freely for any purpose. However, when distributed,
|
12
|
+
the original source must be clearly stated, and, when the source code is
|
13
|
+
distributed, the copyright notice must be retained and any alterations in
|
14
|
+
the code must be clearly marked. No warranty is given regarding the quality
|
15
|
+
of this software.
|
16
|
+
|
17
|
+
Modified by Wong Chi-Kwong, 2004
|
18
|
+
|
19
|
+
Changes summary: - Used long variable and function names
|
20
|
+
- Removed global variables
|
21
|
+
- Replace pointer references with array references
|
22
|
+
- Used insertion sort in place of selection sort and increased insertion sort threshold
|
23
|
+
- Reconstructing suffix array from inverse becomes an option
|
24
|
+
- Added handling for the case where the end-of-text symbol is not necessarily smaller than all characters
|
25
|
+
- Removed codes for supporting alphabet size > number of characters
|
26
|
+
|
27
|
+
No warranty is given regarding the quality of the modifications.
|
28
|
+
|
29
|
+
*/
|
30
|
+
|
31
|
+
|
32
|
+
#include <stdio.h>
|
33
|
+
#include <stdlib.h>
|
34
|
+
#include <limits.h>
|
35
|
+
#include "bwt_gen.h"
|
36
|
+
#include "QSufSort.h"
|
37
|
+
|
38
|
+
// Static functions

/* Sorting of one unsorted group: ternary-split quicksort, its pivot
   selection, and the insertion sort used for small groups. */
static void QSufSortSortSplit(int* __restrict V, int* __restrict I,
							  const int lowestPos, const int highestPos,
							  const int numSortedChar);
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I,
							   const int lowestPos, const int highestPos,
							   const int numSortedChar);
static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I,
									const int lowestPos, const int highestPos,
									const int numSortedChar);

/* First-pass helpers: bucket sort and alphabet transformation. */
static void QSufSortBucketSort(int* __restrict V, int* __restrict I,
							   const int numChar, const int alphabetSize);
static int QSufSortTransform(int* __restrict V, int* __restrict I,
							 const int numChar, const int largestInputSymbol,
							 const int smallestInputSymbol, const int maxNewAlphabetSize,
							 int *numSymbolAggregated);
|
48
|
+
|
49
|
+
// from MiscUtilities.c
/* Returns the number of leading zero bits in the low 32 bits of input,
   counted from bit 31 downwards; the result is 32 when those bits are all
   zero. Bits above bit 31 (on platforms where unsigned int is wider than
   32 bits) are ignored so that the table index can never leave 0..255. */
static unsigned int leadingZero(const unsigned int input) {

	/* Leading-zero count of every 8-bit value. Entries 0x80..0xFF are 0
	   and are supplied by the implicit zero-initialisation of the array. */
	static const unsigned int leadingZero8bit[256] = {
		/* 0x00-0x0F */ 8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,
		/* 0x10-0x1F */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
		/* 0x20-0x2F */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		/* 0x30-0x3F */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		/* 0x40-0x4F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		/* 0x50-0x5F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		/* 0x60-0x6F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		/* 0x70-0x7F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
	};
	const unsigned int word = input & 0xFFFFFFFFu;

	/* Find the most significant non-zero byte, then look up the count
	   inside that byte and add 8 for every all-zero byte above it. */
	if (word & 0xFFFF0000u) {
		if (word & 0xFF000000u) {
			return leadingZero8bit[word >> 24];
		}
		return 8 + leadingZero8bit[word >> 16];
	}
	if (word & 0x0000FF00u) {
		return 16 + leadingZero8bit[word >> 8];
	}
	return 24 + leadingZero8bit[word];

}
|
78
|
+
|
79
|
+
/* Suffix-sorts the text held in V by prefix doubling.

   V and I are both arrays of numChar+1 ints. On entry V[0...numChar-1] holds
   the input symbols, which are taken to lie in
   smallestInputSymbol...largestInputSymbol; the original content of
   V[numChar] is disregarded, that position being treated as an end-of-string
   symbol smaller than all others.

   Unless skipTransform is non-zero, the symbols are first aggregated and
   compacted by QSufSortTransform() and the suffixes bucket sorted by
   QSufSortBucketSort(). With skipTransform set, V and I are used as passed
   in -- NOTE(review): presumably they must then already be in the state
   those two steps produce; confirm against the callers.

   The refinement loop runs until I[0] < -numChar, i.e. until the sorted
   group starting at I[0] spans all numChar+1 entries. V then holds the final
   group number of every suffix (the inverse of the suffix array). I is NOT
   rebuilt into the suffix array here; QSufSortGenerateSaFromInverse() does
   that from V when needed. */
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
						const int smallestInputSymbol, const int skipTransform) {

	int i, j;
	int s, negatedSortedGroupLength;
	int numSymbolAggregated;
	int numSortedPos = 1;
	int newAlphabetSize;

	if (!skipTransform) {
		/* bucketing possible */
		newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
											numChar, &numSymbolAggregated);
		QSufSortBucketSort(V, I, numChar, newAlphabetSize);
		I[0] = -1;
		V[numChar] = 0;
		/* each transformed symbol already covers this many input symbols */
		numSortedPos = numSymbolAggregated;
	}

	while ((int)(I[0]) >= -(int)numChar) {
		i = 0;
		negatedSortedGroupLength = 0;
		do {
			s = I[i];
			if (s < 0) {
				i -= s;						/* skip over sorted group. */
				negatedSortedGroupLength += s;
			} else {
				if (negatedSortedGroupLength) {
					I[i+negatedSortedGroupLength] = negatedSortedGroupLength;	/* combine preceding sorted groups */
					negatedSortedGroupLength = 0;
				}
				/* V[s] is the last index of the unsorted group starting at i */
				j = V[s] + 1;
				QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
				i = j;
			}
		} while (i <= numChar);
		if (negatedSortedGroupLength) {
			/* array ends with a sorted group. */
			I[i+negatedSortedGroupLength] = negatedSortedGroupLength;	/* combine sorted groups at end of I. */
		}
		numSortedPos *= 2;	/* double sorted-depth. */
	}

}
|
131
|
+
|
132
|
+
/* Fills I from the inverse array V: for every position pos in 0...numChar
   the slot I[V[pos]] receives pos + 1, so positions are stored one-based.
   V is only read. */
void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) {

	int pos = 0;

	while (pos <= numChar) {
		const int rank = V[pos];
		pos++;
		I[rank] = pos;
	}

}
|
140
|
+
|
141
|
+
/* Sorting routine called for each unsorted group. Sorts the array of integers
|
142
|
+
(suffix numbers) of length n starting at p. The algorithm is a ternary-split
|
143
|
+
quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
|
144
|
+
Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
|
145
|
+
function is based on Program 7.*/
|
146
|
+
static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
|
147
|
+
const int highestPos, const int numSortedChar) {
|
148
|
+
|
149
|
+
int a, b, c, d;
|
150
|
+
int l, m;
|
151
|
+
int f, v, s, t;
|
152
|
+
int tmp;
|
153
|
+
int numItem;
|
154
|
+
|
155
|
+
#ifdef DEBUG
|
156
|
+
if (lowestPos > highestPos) {
|
157
|
+
fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n");
|
158
|
+
exit(1);
|
159
|
+
}
|
160
|
+
#endif
|
161
|
+
|
162
|
+
numItem = highestPos - lowestPos + 1;
|
163
|
+
|
164
|
+
if (numItem <= INSERT_SORT_NUM_ITEM) {
|
165
|
+
QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
|
166
|
+
return;
|
167
|
+
}
|
168
|
+
|
169
|
+
v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
|
170
|
+
|
171
|
+
a = b = lowestPos;
|
172
|
+
c = d = highestPos;
|
173
|
+
|
174
|
+
while (TRUE) {
|
175
|
+
while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
|
176
|
+
if (f == v) {
|
177
|
+
swap(I[a], I[b], tmp);
|
178
|
+
a++;
|
179
|
+
}
|
180
|
+
b++;
|
181
|
+
}
|
182
|
+
while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
|
183
|
+
if (f == v) {
|
184
|
+
swap(I[c], I[d], tmp);
|
185
|
+
d--;
|
186
|
+
}
|
187
|
+
c--;
|
188
|
+
}
|
189
|
+
if (b > c) {
|
190
|
+
break;
|
191
|
+
}
|
192
|
+
swap(I[b], I[c], tmp);
|
193
|
+
b++;
|
194
|
+
c--;
|
195
|
+
}
|
196
|
+
|
197
|
+
s = a - lowestPos;
|
198
|
+
t = b - a;
|
199
|
+
s = min(s, t);
|
200
|
+
for (l = lowestPos, m = b - s; m < b; l++, m++) {
|
201
|
+
swap(I[l], I[m], tmp);
|
202
|
+
}
|
203
|
+
|
204
|
+
s = d - c;
|
205
|
+
t = highestPos - d;
|
206
|
+
s = min(s, t);
|
207
|
+
for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
|
208
|
+
swap(I[l], I[m], tmp);
|
209
|
+
}
|
210
|
+
|
211
|
+
s = b - a;
|
212
|
+
t = d - c;
|
213
|
+
if (s > 0) {
|
214
|
+
QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
|
215
|
+
}
|
216
|
+
|
217
|
+
// Update group number for equal portion
|
218
|
+
a = lowestPos + s;
|
219
|
+
b = highestPos - t;
|
220
|
+
if (a == b) {
|
221
|
+
// Sorted group
|
222
|
+
V[I[a]] = a;
|
223
|
+
I[a] = -1;
|
224
|
+
} else {
|
225
|
+
// Unsorted group
|
226
|
+
for (c=a; c<=b; c++) {
|
227
|
+
V[I[c]] = b;
|
228
|
+
}
|
229
|
+
}
|
230
|
+
|
231
|
+
if (t > 0) {
|
232
|
+
QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
|
233
|
+
}
|
234
|
+
|
235
|
+
}
|
236
|
+
|
237
|
+
/* Pivot selection, algorithm by Bentley & McIlroy.

   Samples nine keys of I[lowestPos...highestPos]: three spaced numItem/8
   apart at the low end, three around the middle and three at the high end.
   Returns the median of the three per-triple medians. The value returned is
   a key, i.e. KEY(V, I, pos, numSortedChar), not a position. */
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
							   const int highestPos, const int numSortedChar) {

	int middle;
	int stride;
	int probe;
	int keyA, keyB, keyC;
	int medLow, medMid, medHigh;
	int numItem;

	#ifdef DEBUG
	if (lowestPos > highestPos) {
		fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n");
		exit(1);
	}
	#endif

	numItem = highestPos - lowestPos + 1;

	#ifdef DEBUG
	if (numItem <= INSERT_SORT_NUM_ITEM) {
		fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n");
		exit(1);
	}
	#endif

	middle = lowestPos + numItem / 2;
	stride = numItem / 8;

	/* low end: lowestPos, +stride, +2*stride */
	probe = lowestPos;
	keyA = KEY(V, I, probe, numSortedChar);
	probe = lowestPos + stride;
	keyB = KEY(V, I, probe, numSortedChar);
	probe = lowestPos + 2 * stride;
	keyC = KEY(V, I, probe, numSortedChar);
	medLow = med3(keyA, keyB, keyC);

	/* middle: middle-stride, middle, middle+stride */
	probe = middle - stride;
	keyA = KEY(V, I, probe, numSortedChar);
	probe = middle;
	keyB = KEY(V, I, probe, numSortedChar);
	probe = middle + stride;
	keyC = KEY(V, I, probe, numSortedChar);
	medMid = med3(keyA, keyB, keyC);

	/* high end: highestPos-2*stride, -stride, highestPos */
	probe = highestPos - 2 * stride;
	keyA = KEY(V, I, probe, numSortedChar);
	probe = highestPos - stride;
	keyB = KEY(V, I, probe, numSortedChar);
	probe = highestPos;
	keyC = KEY(V, I, probe, numSortedChar);
	medHigh = med3(keyA, keyB, keyC);

	return med3(medLow, medMid, medHigh);

}
|
283
|
+
|
284
|
+
/* Quadratic sorting method to use for small subarrays. */
|
285
|
+
static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
|
286
|
+
const int highestPos, const int numSortedChar) {
|
287
|
+
|
288
|
+
int i, j;
|
289
|
+
int tmpKey, tmpPos;
|
290
|
+
int numItem;
|
291
|
+
int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
|
292
|
+
int negativeSortedLength;
|
293
|
+
int groupNum;
|
294
|
+
|
295
|
+
#ifdef DEBUG
|
296
|
+
if (lowestPos > highestPos) {
|
297
|
+
fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n");
|
298
|
+
exit(1);
|
299
|
+
}
|
300
|
+
#endif
|
301
|
+
|
302
|
+
numItem = highestPos - lowestPos + 1;
|
303
|
+
|
304
|
+
#ifdef DEBUG
|
305
|
+
if (numItem > INSERT_SORT_NUM_ITEM) {
|
306
|
+
fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n");
|
307
|
+
exit(1);
|
308
|
+
}
|
309
|
+
#endif
|
310
|
+
|
311
|
+
for (i=0; i<numItem; i++) {
|
312
|
+
#ifdef DEBUG
|
313
|
+
if (I[lowestPos + i] < 0) {
|
314
|
+
fprintf(stderr, "QSufSortInsertSortSplit(): I < 0 in unsorted region!\n");
|
315
|
+
exit(1);
|
316
|
+
}
|
317
|
+
#endif
|
318
|
+
pos[i] = I[lowestPos + i];
|
319
|
+
key[i] = V[pos[i] + numSortedChar];
|
320
|
+
}
|
321
|
+
|
322
|
+
for (i=1; i<numItem; i++) {
|
323
|
+
tmpKey = key[i];
|
324
|
+
tmpPos = pos[i];
|
325
|
+
for (j=i; j>0 && key[j-1] > tmpKey; j--) {
|
326
|
+
key[j] = key[j-1];
|
327
|
+
pos[j] = pos[j-1];
|
328
|
+
}
|
329
|
+
key[j] = tmpKey;
|
330
|
+
pos[j] = tmpPos;
|
331
|
+
}
|
332
|
+
|
333
|
+
negativeSortedLength = -1;
|
334
|
+
|
335
|
+
i = numItem - 1;
|
336
|
+
groupNum = highestPos;
|
337
|
+
while (i > 0) {
|
338
|
+
I[i+lowestPos] = pos[i];
|
339
|
+
V[I[i+lowestPos]] = groupNum;
|
340
|
+
if (key[i-1] == key[i]) {
|
341
|
+
negativeSortedLength = 0;
|
342
|
+
} else {
|
343
|
+
if (negativeSortedLength < 0) {
|
344
|
+
I[i+lowestPos] = negativeSortedLength;
|
345
|
+
}
|
346
|
+
groupNum = i + lowestPos - 1;
|
347
|
+
negativeSortedLength--;
|
348
|
+
}
|
349
|
+
i--;
|
350
|
+
}
|
351
|
+
|
352
|
+
I[lowestPos] = pos[0];
|
353
|
+
V[I[lowestPos]] = groupNum;
|
354
|
+
if (negativeSortedLength < 0) {
|
355
|
+
I[lowestPos] = negativeSortedLength;
|
356
|
+
}
|
357
|
+
|
358
|
+
}
|
359
|
+
|
360
|
+
/* Bucket sort for the first iteration.

   Input: V[0...numChar-1] holds integers in the range 1...alphabetSize-1,
   all of which appear at least once, and V[numChar] is 0 (this is the
   corresponding output of the transform). alphabetSize must be at most
   numChar+1. I is an array of size numChar+1 whose contents are disregarded.

   Output: V and I after the initial sorting stage of the refined suffix
   sorting algorithm. V[pos] is the last index of the bucket holding pos;
   I lists the positions bucket by bucket, with a bucket of a single
   position stored as -1.

   I serves first as the table of per-symbol list heads and V as the "next"
   links of those lists; both are overwritten with the result while the
   buckets are emitted from the largest symbol down. */
static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize) {

	int pos, sym;
	int cur, next;
	int groupNum;
	int slot;

	// mark linked list empty
	for (sym = 0; sym < alphabetSize; sym++) {
		I[sym] = -1;
	}

	// insert to linked list
	for (pos = 0; pos <= numChar; pos++) {
		sym = V[pos];
		V[pos] = (int)(I[sym]);
		I[sym] = pos;
	}

	// emit the buckets, filling I from its top end downwards
	slot = numChar;
	for (sym = alphabetSize - 1; sym >= 0; sym--) {
		cur = I[sym];
		next = (int)(V[cur]);
		groupNum = slot;
		V[cur] = groupNum;
		if (next < 0) {
			// sorted group
			I[slot] = -1;
		} else {
			I[slot] = cur;
			while (next >= 0) {
				cur = next;
				next = V[cur];
				V[cur] = groupNum;
				slot--;
				I[slot] = cur;
			}
		}
		slot--;
	}

}
|
411
|
+
|
412
|
+
/* Transforms the alphabet of x by attempting to aggregate several symbols into
|
413
|
+
one, while preserving the suffix order of x. The alphabet may also be
|
414
|
+
compacted, so that x on output comprises all integers of the new alphabet
|
415
|
+
with no skipped numbers.
|
416
|
+
|
417
|
+
Input: x is an array of size n+1 whose first n elements are positive
|
418
|
+
integers in the range l...k-1. p is array of size n+1, used for temporary
|
419
|
+
storage. q controls aggregation and compaction by defining the maximum intue
|
420
|
+
for any symbol during transformation: q must be at least k-l; if q<=n,
|
421
|
+
compaction is guaranteed; if k-l>n, compaction is never done; if q is
|
422
|
+
INT_MAX, the maximum number of symbols are aggregated into one.
|
423
|
+
|
424
|
+
Output: Returns an integer j in the range 1...q representing the size of the
|
425
|
+
new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
|
426
|
+
set to the number of old symbols grouped into one. Only x[n] is 0.*/
|
427
|
+
static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
|
428
|
+
const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) {
|
429
|
+
|
430
|
+
int c, i, j;
|
431
|
+
int a; // numSymbolAggregated
|
432
|
+
int mask;
|
433
|
+
int minSymbolInChunk = 0, maxSymbolInChunk = 0;
|
434
|
+
int newAlphabetSize;
|
435
|
+
int maxNumInputSymbol, maxNumBit, maxSymbol;
|
436
|
+
|
437
|
+
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
438
|
+
|
439
|
+
maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol);
|
440
|
+
maxSymbol = INT_MAX >> maxNumBit;
|
441
|
+
|
442
|
+
c = maxNumInputSymbol;
|
443
|
+
for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
|
444
|
+
minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
|
445
|
+
maxSymbolInChunk = c;
|
446
|
+
c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
|
447
|
+
}
|
448
|
+
|
449
|
+
mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
|
450
|
+
V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
|
451
|
+
|
452
|
+
#ifdef DEBUG
|
453
|
+
// Section of code for maxSymbolInChunk > numChar removed!
|
454
|
+
if (maxSymbolInChunk > numChar) {
|
455
|
+
fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n");
|
456
|
+
exit(1);
|
457
|
+
}
|
458
|
+
#endif
|
459
|
+
|
460
|
+
/* bucketing possible, compact alphabet.*/
|
461
|
+
for (i=0; i<=maxSymbolInChunk; i++) {
|
462
|
+
I[i] = 0; /* zero transformation table.*/
|
463
|
+
}
|
464
|
+
c = minSymbolInChunk;
|
465
|
+
for (i=a; i<=numChar; i++) {
|
466
|
+
I[c] = 1; /* mark used chunk symbol.*/
|
467
|
+
c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
|
468
|
+
}
|
469
|
+
for (i=1; i<a; i++) { /* handle last r-1 positions.*/
|
470
|
+
I[c] = 1; /* mark used chunk symbol.*/
|
471
|
+
c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/
|
472
|
+
}
|
473
|
+
newAlphabetSize = 1;
|
474
|
+
for (i=0; i<=maxSymbolInChunk; i++) {
|
475
|
+
if (I[i]) {
|
476
|
+
I[i] = newAlphabetSize;
|
477
|
+
newAlphabetSize++;
|
478
|
+
}
|
479
|
+
}
|
480
|
+
c = minSymbolInChunk;
|
481
|
+
for (i=0, j=a; j<=numChar; i++, j++) {
|
482
|
+
V[i] = I[c]; /* transform to new alphabet.*/
|
483
|
+
c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
|
484
|
+
}
|
485
|
+
for (; i<numChar; i++) { /* handle last a-1 positions.*/
|
486
|
+
V[i] = I[c]; /* transform to new alphabet.*/
|
487
|
+
c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
|
488
|
+
}
|
489
|
+
|
490
|
+
V[numChar] = 0; /* end-of-string symbol is zero.*/
|
491
|
+
|
492
|
+
*numSymbolAggregated = a;
|
493
|
+
return newAlphabetSize;
|
494
|
+
|
495
|
+
}
|
496
|
+
|