ngs_server 0.4 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/tabix/ChangeLog +593 -0
- data/ext/tabix/Makefile +65 -0
- data/ext/tabix/NEWS +126 -0
- data/ext/tabix/TabixReader.java +395 -0
- data/ext/tabix/bam_endian.h +42 -0
- data/ext/tabix/bedidx.c +156 -0
- data/ext/tabix/bgzf.c +714 -0
- data/ext/tabix/bgzf.h +157 -0
- data/ext/tabix/bgzip.c +206 -0
- data/ext/tabix/example.gtf.gz +0 -0
- data/ext/tabix/example.gtf.gz.tbi +0 -0
- data/ext/tabix/extconf.rb +1 -0
- data/ext/tabix/index.c +998 -0
- data/ext/tabix/khash.h +486 -0
- data/ext/tabix/knetfile.c +632 -0
- data/ext/tabix/knetfile.h +75 -0
- data/ext/tabix/kseq.h +227 -0
- data/ext/tabix/ksort.h +271 -0
- data/ext/tabix/kstring.c +165 -0
- data/ext/tabix/kstring.h +68 -0
- data/ext/tabix/main.c +290 -0
- data/ext/tabix/perl/MANIFEST +8 -0
- data/ext/tabix/perl/Makefile.PL +8 -0
- data/ext/tabix/perl/Tabix.pm +76 -0
- data/ext/tabix/perl/Tabix.xs +71 -0
- data/ext/tabix/perl/TabixIterator.pm +41 -0
- data/ext/tabix/perl/t/01local.t +28 -0
- data/ext/tabix/perl/t/02remote.t +28 -0
- data/ext/tabix/perl/typemap +3 -0
- data/ext/tabix/python/setup.py +55 -0
- data/ext/tabix/python/tabixmodule.c +408 -0
- data/ext/tabix/python/test.py +91 -0
- data/ext/tabix/tabix.1 +132 -0
- data/ext/tabix/tabix.h +145 -0
- data/ext/tabix/tabix.py +87 -0
- data/ext/tabix/tabix.tex +121 -0
- data/ext/vcftools/perl/Vcf.pm +5 -3
- data/ext/vcftools/perl/vcf-query +2 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +12 -11
- data/ngs_server.gemspec +1 -2
- metadata +39 -2
data/ext/tabix/index.c
ADDED
@@ -0,0 +1,998 @@
|
|
1
|
+
#include <ctype.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <sys/stat.h>
|
4
|
+
#include "khash.h"
|
5
|
+
#include "ksort.h"
|
6
|
+
#include "kstring.h"
|
7
|
+
#include "bam_endian.h"
|
8
|
+
#ifdef _USE_KNETFILE
|
9
|
+
#include "knetfile.h"
|
10
|
+
#endif
|
11
|
+
#include "tabix.h"
|
12
|
+
|
13
|
+
#define TAD_MIN_CHUNK_GAP 32768
|
14
|
+
// 1<<14 is the size of minimum bin.
|
15
|
+
#define TAD_LIDX_SHIFT 14
|
16
|
+
|
17
|
+
typedef struct {
|
18
|
+
uint64_t u, v;
|
19
|
+
} pair64_t;
|
20
|
+
|
21
|
+
#define pair64_lt(a,b) ((a).u < (b).u)
|
22
|
+
KSORT_INIT(offt, pair64_t, pair64_lt)
|
23
|
+
|
24
|
+
typedef struct {
|
25
|
+
uint32_t m, n;
|
26
|
+
pair64_t *list;
|
27
|
+
} ti_binlist_t;
|
28
|
+
|
29
|
+
typedef struct {
|
30
|
+
int32_t n, m;
|
31
|
+
uint64_t *offset;
|
32
|
+
} ti_lidx_t;
|
33
|
+
|
34
|
+
KHASH_MAP_INIT_INT(i, ti_binlist_t)
|
35
|
+
KHASH_MAP_INIT_STR(s, int)
|
36
|
+
|
37
|
+
struct __ti_index_t {
|
38
|
+
ti_conf_t conf;
|
39
|
+
int32_t n, max;
|
40
|
+
khash_t(s) *tname;
|
41
|
+
khash_t(i) **index;
|
42
|
+
ti_lidx_t *index2;
|
43
|
+
};
|
44
|
+
|
45
|
+
struct __ti_iter_t {
|
46
|
+
int from_first; // read from the first record; no random access
|
47
|
+
int tid, beg, end, n_off, i, finished;
|
48
|
+
uint64_t curr_off;
|
49
|
+
kstring_t str;
|
50
|
+
const ti_index_t *idx;
|
51
|
+
pair64_t *off;
|
52
|
+
};
|
53
|
+
|
54
|
+
typedef struct {
|
55
|
+
int tid, beg, end, bin;
|
56
|
+
} ti_intv_t;
|
57
|
+
|
58
|
+
ti_conf_t ti_conf_gff = { 0, 1, 4, 5, '#', 0 };
|
59
|
+
ti_conf_t ti_conf_bed = { TI_FLAG_UCSC, 1, 2, 3, '#', 0 };
|
60
|
+
ti_conf_t ti_conf_psltbl = { TI_FLAG_UCSC, 15, 17, 18, '#', 0 };
|
61
|
+
ti_conf_t ti_conf_sam = { TI_PRESET_SAM, 3, 4, 0, '@', 0 };
|
62
|
+
ti_conf_t ti_conf_vcf = { TI_PRESET_VCF, 1, 2, 0, '#', 0 };
|
63
|
+
|
64
|
+
/***************
|
65
|
+
* read a line *
|
66
|
+
***************/
|
67
|
+
|
68
|
+
/*
|
69
|
+
int ti_readline(BGZF *fp, kstring_t *str)
|
70
|
+
{
|
71
|
+
int c, l = 0;
|
72
|
+
str->l = 0;
|
73
|
+
while ((c = bgzf_getc(fp)) >= 0 && c != '\n') {
|
74
|
+
++l;
|
75
|
+
if (c != '\r') kputc(c, str);
|
76
|
+
}
|
77
|
+
if (c < 0 && l == 0) return -1; // end of file
|
78
|
+
return str->l;
|
79
|
+
}
|
80
|
+
*/
|
81
|
+
|
82
|
+
/* Below is a faster implementation largely equivalent to the one
|
83
|
+
* commented out above. */
|
84
|
+
int ti_readline(BGZF *fp, kstring_t *str)
|
85
|
+
{
|
86
|
+
int l, state = 0;
|
87
|
+
unsigned char *buf = (unsigned char*)fp->uncompressed_block;
|
88
|
+
str->l = 0;
|
89
|
+
do {
|
90
|
+
if (fp->block_offset >= fp->block_length) {
|
91
|
+
if (bgzf_read_block(fp) != 0) { state = -2; break; }
|
92
|
+
if (fp->block_length == 0) { state = -1; break; }
|
93
|
+
}
|
94
|
+
for (l = fp->block_offset; l < fp->block_length && buf[l] != '\n'; ++l);
|
95
|
+
if (l < fp->block_length) state = 1;
|
96
|
+
l -= fp->block_offset;
|
97
|
+
if (str->l + l + 1 >= str->m) {
|
98
|
+
str->m = str->l + l + 2;
|
99
|
+
kroundup32(str->m);
|
100
|
+
str->s = (char*)realloc(str->s, str->m);
|
101
|
+
}
|
102
|
+
memcpy(str->s + str->l, buf + fp->block_offset, l);
|
103
|
+
str->l += l;
|
104
|
+
fp->block_offset += l + 1;
|
105
|
+
if (fp->block_offset >= fp->block_length) {
|
106
|
+
#ifdef _USE_KNETFILE
|
107
|
+
fp->block_address = knet_tell(fp->x.fpr);
|
108
|
+
#else
|
109
|
+
fp->block_address = ftello(fp->file);
|
110
|
+
#endif
|
111
|
+
fp->block_offset = 0;
|
112
|
+
fp->block_length = 0;
|
113
|
+
}
|
114
|
+
} while (state == 0);
|
115
|
+
if (str->l == 0 && state < 0) return state;
|
116
|
+
str->s[str->l] = 0;
|
117
|
+
return str->l;
|
118
|
+
}
|
119
|
+
|
120
|
+
/*************************************
|
121
|
+
* get the interval from a data line *
|
122
|
+
*************************************/
|
123
|
+
|
124
|
+
static inline int ti_reg2bin(uint32_t beg, uint32_t end)
|
125
|
+
{
|
126
|
+
--end;
|
127
|
+
if (beg>>14 == end>>14) return 4681 + (beg>>14);
|
128
|
+
if (beg>>17 == end>>17) return 585 + (beg>>17);
|
129
|
+
if (beg>>20 == end>>20) return 73 + (beg>>20);
|
130
|
+
if (beg>>23 == end>>23) return 9 + (beg>>23);
|
131
|
+
if (beg>>26 == end>>26) return 1 + (beg>>26);
|
132
|
+
return 0;
|
133
|
+
}
|
134
|
+
|
135
|
+
static int get_tid(ti_index_t *idx, const char *ss)
|
136
|
+
{
|
137
|
+
khint_t k;
|
138
|
+
int tid;
|
139
|
+
k = kh_get(s, idx->tname, ss);
|
140
|
+
if (k == kh_end(idx->tname)) { // a new target sequence
|
141
|
+
int ret, size;
|
142
|
+
// update idx->n, ->max, ->index and ->index2
|
143
|
+
if (idx->n == idx->max) {
|
144
|
+
idx->max = idx->max? idx->max<<1 : 8;
|
145
|
+
idx->index = realloc(idx->index, idx->max * sizeof(void*));
|
146
|
+
idx->index2 = realloc(idx->index2, idx->max * sizeof(ti_lidx_t));
|
147
|
+
}
|
148
|
+
memset(&idx->index2[idx->n], 0, sizeof(ti_lidx_t));
|
149
|
+
idx->index[idx->n++] = kh_init(i);
|
150
|
+
// update ->tname
|
151
|
+
tid = size = kh_size(idx->tname);
|
152
|
+
k = kh_put(s, idx->tname, strdup(ss), &ret);
|
153
|
+
kh_value(idx->tname, k) = size;
|
154
|
+
assert(idx->n == kh_size(idx->tname));
|
155
|
+
} else tid = kh_value(idx->tname, k);
|
156
|
+
return tid;
|
157
|
+
}
|
158
|
+
|
159
|
+
int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv)
|
160
|
+
{
|
161
|
+
int i, b = 0, id = 1, ncols = 0;
|
162
|
+
char *s;
|
163
|
+
intv->ss = intv->se = 0; intv->beg = intv->end = -1;
|
164
|
+
for (i = 0; i <= len; ++i) {
|
165
|
+
if (line[i] == '\t' || line[i] == 0) {
|
166
|
+
++ncols;
|
167
|
+
if (id == conf->sc) {
|
168
|
+
intv->ss = line + b; intv->se = line + i;
|
169
|
+
} else if (id == conf->bc) {
|
170
|
+
// here ->beg is 0-based.
|
171
|
+
intv->beg = intv->end = strtol(line + b, &s, 0);
|
172
|
+
if (!(conf->preset&TI_FLAG_UCSC)) --intv->beg;
|
173
|
+
else ++intv->end;
|
174
|
+
if (intv->beg < 0) intv->beg = 0;
|
175
|
+
if (intv->end < 1) intv->end = 1;
|
176
|
+
} else {
|
177
|
+
if ((conf->preset&0xffff) == TI_PRESET_GENERIC) {
|
178
|
+
if (id == conf->ec) intv->end = strtol(line + b, &s, 0);
|
179
|
+
} else if ((conf->preset&0xffff) == TI_PRESET_SAM) {
|
180
|
+
if (id == 6) { // CIGAR
|
181
|
+
int l = 0, op;
|
182
|
+
char *t;
|
183
|
+
for (s = line + b; s < line + i;) {
|
184
|
+
long x = strtol(s, &t, 10);
|
185
|
+
op = toupper(*t);
|
186
|
+
if (op == 'M' || op == 'D' || op == 'N') l += x;
|
187
|
+
s = t + 1;
|
188
|
+
}
|
189
|
+
if (l == 0) l = 1;
|
190
|
+
intv->end = intv->beg + l;
|
191
|
+
}
|
192
|
+
} else if ((conf->preset&0xffff) == TI_PRESET_VCF) {
|
193
|
+
// FIXME: the following is NOT tested and is likely to be buggy
|
194
|
+
if (id == 4) {
|
195
|
+
if (b < i) intv->end = intv->beg + (i - b);
|
196
|
+
} else if (id == 8) { // look for "END="
|
197
|
+
int c = line[i];
|
198
|
+
line[i] = 0;
|
199
|
+
s = strstr(line + b, "END=");
|
200
|
+
if (s == line + b) s += 4;
|
201
|
+
else if (s) {
|
202
|
+
s = strstr(line + b, ";END=");
|
203
|
+
if (s) s += 5;
|
204
|
+
}
|
205
|
+
if (s) intv->end = strtol(s, &s, 0);
|
206
|
+
line[i] = c;
|
207
|
+
}
|
208
|
+
}
|
209
|
+
}
|
210
|
+
b = i + 1;
|
211
|
+
++id;
|
212
|
+
}
|
213
|
+
}
|
214
|
+
/*
|
215
|
+
if (ncols < conf->sc || ncols < conf->bc || ncols < conf->ec) {
|
216
|
+
if (ncols == 1) fprintf(stderr,"[get_intv] Is the file tab-delimited? The line has %d field only: %s\n", ncols, line);
|
217
|
+
else fprintf(stderr,"[get_intv] The line has %d field(s) only: %s\n", ncols, line);
|
218
|
+
exit(1);
|
219
|
+
}
|
220
|
+
*/
|
221
|
+
if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
|
222
|
+
return 0;
|
223
|
+
}
|
224
|
+
|
225
|
+
static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv)
|
226
|
+
{
|
227
|
+
ti_interval_t x;
|
228
|
+
intv->tid = intv->beg = intv->end = intv->bin = -1;
|
229
|
+
if (ti_get_intv(&idx->conf, str->l, str->s, &x) == 0) {
|
230
|
+
int c = *x.se;
|
231
|
+
*x.se = '\0'; intv->tid = get_tid(idx, x.ss); *x.se = c;
|
232
|
+
intv->beg = x.beg; intv->end = x.end;
|
233
|
+
intv->bin = ti_reg2bin(intv->beg, intv->end);
|
234
|
+
return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
|
235
|
+
} else {
|
236
|
+
fprintf(stderr, "[%s] the following line cannot be parsed and skipped: %s\n", __func__, str->s);
|
237
|
+
return -1;
|
238
|
+
}
|
239
|
+
}
|
240
|
+
|
241
|
+
/************
|
242
|
+
* indexing *
|
243
|
+
************/
|
244
|
+
|
245
|
+
// requirement: len <= LEN_MASK
|
246
|
+
static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
|
247
|
+
{
|
248
|
+
khint_t k;
|
249
|
+
ti_binlist_t *l;
|
250
|
+
int ret;
|
251
|
+
k = kh_put(i, h, bin, &ret);
|
252
|
+
l = &kh_value(h, k);
|
253
|
+
if (ret) { // not present
|
254
|
+
l->m = 1; l->n = 0;
|
255
|
+
l->list = (pair64_t*)calloc(l->m, 16);
|
256
|
+
}
|
257
|
+
if (l->n == l->m) {
|
258
|
+
l->m <<= 1;
|
259
|
+
l->list = (pair64_t*)realloc(l->list, l->m * 16);
|
260
|
+
}
|
261
|
+
l->list[l->n].u = beg; l->list[l->n++].v = end;
|
262
|
+
}
|
263
|
+
|
264
|
+
static inline uint64_t insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset)
|
265
|
+
{
|
266
|
+
int i, beg, end;
|
267
|
+
beg = _beg >> TAD_LIDX_SHIFT;
|
268
|
+
end = (_end - 1) >> TAD_LIDX_SHIFT;
|
269
|
+
if (index2->m < end + 1) {
|
270
|
+
int old_m = index2->m;
|
271
|
+
index2->m = end + 1;
|
272
|
+
kroundup32(index2->m);
|
273
|
+
index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
|
274
|
+
memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
|
275
|
+
}
|
276
|
+
if (beg == end) {
|
277
|
+
if (index2->offset[beg] == 0) index2->offset[beg] = offset;
|
278
|
+
} else {
|
279
|
+
for (i = beg; i <= end; ++i)
|
280
|
+
if (index2->offset[i] == 0) index2->offset[i] = offset;
|
281
|
+
}
|
282
|
+
if (index2->n < end + 1) index2->n = end + 1;
|
283
|
+
return (uint64_t)beg<<32 | end;
|
284
|
+
}
|
285
|
+
|
286
|
+
static void merge_chunks(ti_index_t *idx)
|
287
|
+
{
|
288
|
+
khash_t(i) *index;
|
289
|
+
int i, l, m;
|
290
|
+
khint_t k;
|
291
|
+
for (i = 0; i < idx->n; ++i) {
|
292
|
+
index = idx->index[i];
|
293
|
+
for (k = kh_begin(index); k != kh_end(index); ++k) {
|
294
|
+
ti_binlist_t *p;
|
295
|
+
if (!kh_exist(index, k)) continue;
|
296
|
+
p = &kh_value(index, k);
|
297
|
+
m = 0;
|
298
|
+
for (l = 1; l < p->n; ++l) {
|
299
|
+
if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
|
300
|
+
else p->list[++m] = p->list[l];
|
301
|
+
} // ~for(l)
|
302
|
+
p->n = m + 1;
|
303
|
+
} // ~for(k)
|
304
|
+
} // ~for(i)
|
305
|
+
}
|
306
|
+
|
307
|
+
static void fill_missing(ti_index_t *idx)
|
308
|
+
{
|
309
|
+
int i, j;
|
310
|
+
for (i = 0; i < idx->n; ++i) {
|
311
|
+
ti_lidx_t *idx2 = &idx->index2[i];
|
312
|
+
for (j = 1; j < idx2->n; ++j)
|
313
|
+
if (idx2->offset[j] == 0)
|
314
|
+
idx2->offset[j] = idx2->offset[j-1];
|
315
|
+
}
|
316
|
+
}
|
317
|
+
|
318
|
+
ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf)
|
319
|
+
{
|
320
|
+
int ret;
|
321
|
+
ti_index_t *idx;
|
322
|
+
uint32_t last_bin, save_bin;
|
323
|
+
int32_t last_coor, last_tid, save_tid;
|
324
|
+
uint64_t save_off, last_off, lineno = 0, offset0 = (uint64_t)-1, tmp;
|
325
|
+
kstring_t *str;
|
326
|
+
|
327
|
+
str = calloc(1, sizeof(kstring_t));
|
328
|
+
|
329
|
+
idx = (ti_index_t*)calloc(1, sizeof(ti_index_t));
|
330
|
+
idx->conf = *conf;
|
331
|
+
idx->n = idx->max = 0;
|
332
|
+
idx->tname = kh_init(s);
|
333
|
+
idx->index = 0;
|
334
|
+
idx->index2 = 0;
|
335
|
+
|
336
|
+
save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
|
337
|
+
save_off = last_off = bgzf_tell(fp); last_coor = 0xffffffffu;
|
338
|
+
while ((ret = ti_readline(fp, str)) >= 0) {
|
339
|
+
ti_intv_t intv;
|
340
|
+
++lineno;
|
341
|
+
if (lineno <= idx->conf.line_skip || str->s[0] == idx->conf.meta_char) {
|
342
|
+
last_off = bgzf_tell(fp);
|
343
|
+
continue;
|
344
|
+
}
|
345
|
+
get_intv(idx, str, &intv);
|
346
|
+
if ( intv.beg<0 || intv.end<0 )
|
347
|
+
{
|
348
|
+
fprintf(stderr,"[ti_index_core] the indexes overlap or are out of bounds\n");
|
349
|
+
exit(1);
|
350
|
+
}
|
351
|
+
if (last_tid != intv.tid) { // change of chromosomes
|
352
|
+
if (last_tid>intv.tid )
|
353
|
+
{
|
354
|
+
fprintf(stderr,"[ti_index_core] the chromosome blocks not continuous at line %llu, is the file sorted? [pos %d]\n",(unsigned long long)lineno,intv.beg+1);
|
355
|
+
exit(1);
|
356
|
+
}
|
357
|
+
last_tid = intv.tid;
|
358
|
+
last_bin = 0xffffffffu;
|
359
|
+
} else if (last_coor > intv.beg) {
|
360
|
+
fprintf(stderr, "[ti_index_core] the file out of order at line %llu\n", (unsigned long long)lineno);
|
361
|
+
exit(1);
|
362
|
+
}
|
363
|
+
tmp = insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off);
|
364
|
+
if (last_off == 0) offset0 = tmp;
|
365
|
+
if (intv.bin != last_bin) { // then possibly write the binning index
|
366
|
+
if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
|
367
|
+
insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
|
368
|
+
save_off = last_off;
|
369
|
+
save_bin = last_bin = intv.bin;
|
370
|
+
save_tid = intv.tid;
|
371
|
+
if (save_tid < 0) break;
|
372
|
+
}
|
373
|
+
if (bgzf_tell(fp) <= last_off) {
|
374
|
+
fprintf(stderr, "[ti_index_core] bug in BGZF: %llx < %llx\n",
|
375
|
+
(unsigned long long)bgzf_tell(fp), (unsigned long long)last_off);
|
376
|
+
exit(1);
|
377
|
+
}
|
378
|
+
last_off = bgzf_tell(fp);
|
379
|
+
last_coor = intv.beg;
|
380
|
+
}
|
381
|
+
if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp));
|
382
|
+
merge_chunks(idx);
|
383
|
+
fill_missing(idx);
|
384
|
+
if (offset0 != (uint64_t)-1 && idx->n && idx->index2[0].offset) {
|
385
|
+
int i, beg = offset0>>32, end = offset0&0xffffffffu;
|
386
|
+
for (i = beg; i <= end; ++i) idx->index2[0].offset[i] = 0;
|
387
|
+
}
|
388
|
+
|
389
|
+
free(str->s); free(str);
|
390
|
+
return idx;
|
391
|
+
}
|
392
|
+
|
393
|
+
void ti_index_destroy(ti_index_t *idx)
|
394
|
+
{
|
395
|
+
khint_t k;
|
396
|
+
int i;
|
397
|
+
if (idx == 0) return;
|
398
|
+
// destroy the name hash table
|
399
|
+
for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) {
|
400
|
+
if (kh_exist(idx->tname, k))
|
401
|
+
free((char*)kh_key(idx->tname, k));
|
402
|
+
}
|
403
|
+
kh_destroy(s, idx->tname);
|
404
|
+
// destroy the binning index
|
405
|
+
for (i = 0; i < idx->n; ++i) {
|
406
|
+
khash_t(i) *index = idx->index[i];
|
407
|
+
ti_lidx_t *index2 = idx->index2 + i;
|
408
|
+
for (k = kh_begin(index); k != kh_end(index); ++k) {
|
409
|
+
if (kh_exist(index, k))
|
410
|
+
free(kh_value(index, k).list);
|
411
|
+
}
|
412
|
+
kh_destroy(i, index);
|
413
|
+
free(index2->offset);
|
414
|
+
}
|
415
|
+
free(idx->index);
|
416
|
+
// destroy the linear index
|
417
|
+
free(idx->index2);
|
418
|
+
free(idx);
|
419
|
+
}
|
420
|
+
|
421
|
+
/******************
|
422
|
+
* index file I/O *
|
423
|
+
******************/
|
424
|
+
|
425
|
+
void ti_index_save(const ti_index_t *idx, BGZF *fp)
|
426
|
+
{
|
427
|
+
int32_t i, size, ti_is_be;
|
428
|
+
khint_t k;
|
429
|
+
ti_is_be = bam_is_big_endian();
|
430
|
+
bgzf_write(fp, "TBI\1", 4);
|
431
|
+
if (ti_is_be) {
|
432
|
+
uint32_t x = idx->n;
|
433
|
+
bgzf_write(fp, bam_swap_endian_4p(&x), 4);
|
434
|
+
} else bgzf_write(fp, &idx->n, 4);
|
435
|
+
assert(sizeof(ti_conf_t) == 24);
|
436
|
+
if (ti_is_be) { // write ti_conf_t;
|
437
|
+
uint32_t x[6];
|
438
|
+
memcpy(x, &idx->conf, 24);
|
439
|
+
for (i = 0; i < 6; ++i) bgzf_write(fp, bam_swap_endian_4p(&x[i]), 4);
|
440
|
+
} else bgzf_write(fp, &idx->conf, sizeof(ti_conf_t));
|
441
|
+
{ // write target names
|
442
|
+
char **name;
|
443
|
+
int32_t l = 0;
|
444
|
+
name = calloc(kh_size(idx->tname), sizeof(void*));
|
445
|
+
for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k)
|
446
|
+
if (kh_exist(idx->tname, k))
|
447
|
+
name[kh_value(idx->tname, k)] = (char*)kh_key(idx->tname, k);
|
448
|
+
for (i = 0; i < kh_size(idx->tname); ++i)
|
449
|
+
l += strlen(name[i]) + 1;
|
450
|
+
if (ti_is_be) bgzf_write(fp, bam_swap_endian_4p(&l), 4);
|
451
|
+
else bgzf_write(fp, &l, 4);
|
452
|
+
for (i = 0; i < kh_size(idx->tname); ++i)
|
453
|
+
bgzf_write(fp, name[i], strlen(name[i]) + 1);
|
454
|
+
free(name);
|
455
|
+
}
|
456
|
+
for (i = 0; i < idx->n; ++i) {
|
457
|
+
khash_t(i) *index = idx->index[i];
|
458
|
+
ti_lidx_t *index2 = idx->index2 + i;
|
459
|
+
// write binning index
|
460
|
+
size = kh_size(index);
|
461
|
+
if (ti_is_be) { // big endian
|
462
|
+
uint32_t x = size;
|
463
|
+
bgzf_write(fp, bam_swap_endian_4p(&x), 4);
|
464
|
+
} else bgzf_write(fp, &size, 4);
|
465
|
+
for (k = kh_begin(index); k != kh_end(index); ++k) {
|
466
|
+
if (kh_exist(index, k)) {
|
467
|
+
ti_binlist_t *p = &kh_value(index, k);
|
468
|
+
if (ti_is_be) { // big endian
|
469
|
+
uint32_t x;
|
470
|
+
x = kh_key(index, k); bgzf_write(fp, bam_swap_endian_4p(&x), 4);
|
471
|
+
x = p->n; bgzf_write(fp, bam_swap_endian_4p(&x), 4);
|
472
|
+
for (x = 0; (int)x < p->n; ++x) {
|
473
|
+
bam_swap_endian_8p(&p->list[x].u);
|
474
|
+
bam_swap_endian_8p(&p->list[x].v);
|
475
|
+
}
|
476
|
+
bgzf_write(fp, p->list, 16 * p->n);
|
477
|
+
for (x = 0; (int)x < p->n; ++x) {
|
478
|
+
bam_swap_endian_8p(&p->list[x].u);
|
479
|
+
bam_swap_endian_8p(&p->list[x].v);
|
480
|
+
}
|
481
|
+
} else {
|
482
|
+
bgzf_write(fp, &kh_key(index, k), 4);
|
483
|
+
bgzf_write(fp, &p->n, 4);
|
484
|
+
bgzf_write(fp, p->list, 16 * p->n);
|
485
|
+
}
|
486
|
+
}
|
487
|
+
}
|
488
|
+
// write linear index (index2)
|
489
|
+
if (ti_is_be) {
|
490
|
+
int x = index2->n;
|
491
|
+
bgzf_write(fp, bam_swap_endian_4p(&x), 4);
|
492
|
+
} else bgzf_write(fp, &index2->n, 4);
|
493
|
+
if (ti_is_be) { // big endian
|
494
|
+
int x;
|
495
|
+
for (x = 0; (int)x < index2->n; ++x)
|
496
|
+
bam_swap_endian_8p(&index2->offset[x]);
|
497
|
+
bgzf_write(fp, index2->offset, 8 * index2->n);
|
498
|
+
for (x = 0; (int)x < index2->n; ++x)
|
499
|
+
bam_swap_endian_8p(&index2->offset[x]);
|
500
|
+
} else bgzf_write(fp, index2->offset, 8 * index2->n);
|
501
|
+
}
|
502
|
+
}
|
503
|
+
|
504
|
+
static ti_index_t *ti_index_load_core(BGZF *fp)
|
505
|
+
{
|
506
|
+
int i, ti_is_be;
|
507
|
+
char magic[4];
|
508
|
+
ti_index_t *idx;
|
509
|
+
ti_is_be = bam_is_big_endian();
|
510
|
+
if (fp == 0) {
|
511
|
+
fprintf(stderr, "[ti_index_load_core] fail to load index.\n");
|
512
|
+
return 0;
|
513
|
+
}
|
514
|
+
bgzf_read(fp, magic, 4);
|
515
|
+
if (strncmp(magic, "TBI\1", 4)) {
|
516
|
+
fprintf(stderr, "[ti_index_load] wrong magic number.\n");
|
517
|
+
return 0;
|
518
|
+
}
|
519
|
+
idx = (ti_index_t*)calloc(1, sizeof(ti_index_t));
|
520
|
+
bgzf_read(fp, &idx->n, 4);
|
521
|
+
if (ti_is_be) bam_swap_endian_4p(&idx->n);
|
522
|
+
idx->tname = kh_init(s);
|
523
|
+
idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
|
524
|
+
idx->index2 = (ti_lidx_t*)calloc(idx->n, sizeof(ti_lidx_t));
|
525
|
+
// read idx->conf
|
526
|
+
bgzf_read(fp, &idx->conf, sizeof(ti_conf_t));
|
527
|
+
if (ti_is_be) {
|
528
|
+
bam_swap_endian_4p(&idx->conf.preset);
|
529
|
+
bam_swap_endian_4p(&idx->conf.sc);
|
530
|
+
bam_swap_endian_4p(&idx->conf.bc);
|
531
|
+
bam_swap_endian_4p(&idx->conf.ec);
|
532
|
+
bam_swap_endian_4p(&idx->conf.meta_char);
|
533
|
+
bam_swap_endian_4p(&idx->conf.line_skip);
|
534
|
+
}
|
535
|
+
{ // read target names
|
536
|
+
int j, ret;
|
537
|
+
kstring_t *str;
|
538
|
+
int32_t l;
|
539
|
+
uint8_t *buf;
|
540
|
+
bgzf_read(fp, &l, 4);
|
541
|
+
if (ti_is_be) bam_swap_endian_4p(&l);
|
542
|
+
buf = calloc(l, 1);
|
543
|
+
bgzf_read(fp, buf, l);
|
544
|
+
str = calloc(1, sizeof(kstring_t));
|
545
|
+
for (i = j = 0; i < l; ++i) {
|
546
|
+
if (buf[i] == 0) {
|
547
|
+
khint_t k = kh_put(s, idx->tname, strdup(str->s), &ret);
|
548
|
+
kh_value(idx->tname, k) = j++;
|
549
|
+
str->l = 0;
|
550
|
+
} else kputc(buf[i], str);
|
551
|
+
}
|
552
|
+
free(str->s); free(str); free(buf);
|
553
|
+
}
|
554
|
+
for (i = 0; i < idx->n; ++i) {
|
555
|
+
khash_t(i) *index;
|
556
|
+
ti_lidx_t *index2 = idx->index2 + i;
|
557
|
+
uint32_t key, size;
|
558
|
+
khint_t k;
|
559
|
+
int j, ret;
|
560
|
+
ti_binlist_t *p;
|
561
|
+
index = idx->index[i] = kh_init(i);
|
562
|
+
// load binning index
|
563
|
+
bgzf_read(fp, &size, 4);
|
564
|
+
if (ti_is_be) bam_swap_endian_4p(&size);
|
565
|
+
for (j = 0; j < (int)size; ++j) {
|
566
|
+
bgzf_read(fp, &key, 4);
|
567
|
+
if (ti_is_be) bam_swap_endian_4p(&key);
|
568
|
+
k = kh_put(i, index, key, &ret);
|
569
|
+
p = &kh_value(index, k);
|
570
|
+
bgzf_read(fp, &p->n, 4);
|
571
|
+
if (ti_is_be) bam_swap_endian_4p(&p->n);
|
572
|
+
p->m = p->n;
|
573
|
+
p->list = (pair64_t*)malloc(p->m * 16);
|
574
|
+
bgzf_read(fp, p->list, 16 * p->n);
|
575
|
+
if (ti_is_be) {
|
576
|
+
int x;
|
577
|
+
for (x = 0; x < p->n; ++x) {
|
578
|
+
bam_swap_endian_8p(&p->list[x].u);
|
579
|
+
bam_swap_endian_8p(&p->list[x].v);
|
580
|
+
}
|
581
|
+
}
|
582
|
+
}
|
583
|
+
// load linear index
|
584
|
+
bgzf_read(fp, &index2->n, 4);
|
585
|
+
if (ti_is_be) bam_swap_endian_4p(&index2->n);
|
586
|
+
index2->m = index2->n;
|
587
|
+
index2->offset = (uint64_t*)calloc(index2->m, 8);
|
588
|
+
bgzf_read(fp, index2->offset, index2->n * 8);
|
589
|
+
if (ti_is_be)
|
590
|
+
for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
|
591
|
+
}
|
592
|
+
return idx;
|
593
|
+
}
|
594
|
+
|
595
|
+
ti_index_t *ti_index_load_local(const char *fnidx)
|
596
|
+
{
|
597
|
+
BGZF *fp;
|
598
|
+
fp = bgzf_open(fnidx, "r");
|
599
|
+
if (fp) {
|
600
|
+
ti_index_t *idx = ti_index_load_core(fp);
|
601
|
+
bgzf_close(fp);
|
602
|
+
return idx;
|
603
|
+
} else return 0;
|
604
|
+
}
|
605
|
+
|
606
|
+
#ifdef _USE_KNETFILE
|
607
|
+
static void download_from_remote(const char *url)
|
608
|
+
{
|
609
|
+
const int buf_size = 1 * 1024 * 1024;
|
610
|
+
char *fn;
|
611
|
+
FILE *fp;
|
612
|
+
uint8_t *buf;
|
613
|
+
knetFile *fp_remote;
|
614
|
+
int l;
|
615
|
+
if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
|
616
|
+
l = strlen(url);
|
617
|
+
for (fn = (char*)url + l - 1; fn >= url; --fn)
|
618
|
+
if (*fn == '/') break;
|
619
|
+
++fn; // fn now points to the file name
|
620
|
+
fp_remote = knet_open(url, "r");
|
621
|
+
if (fp_remote == 0) {
|
622
|
+
fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
|
623
|
+
return;
|
624
|
+
}
|
625
|
+
if ((fp = fopen(fn, "w")) == 0) {
|
626
|
+
fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
|
627
|
+
knet_close(fp_remote);
|
628
|
+
return;
|
629
|
+
}
|
630
|
+
buf = (uint8_t*)calloc(buf_size, 1);
|
631
|
+
while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
|
632
|
+
fwrite(buf, 1, l, fp);
|
633
|
+
free(buf);
|
634
|
+
fclose(fp);
|
635
|
+
knet_close(fp_remote);
|
636
|
+
}
|
637
|
+
#else
|
638
|
+
static void download_from_remote(const char *url)
|
639
|
+
{
|
640
|
+
return;
|
641
|
+
}
|
642
|
+
#endif
|
643
|
+
|
644
|
+
static char *get_local_version(const char *fn)
|
645
|
+
{
|
646
|
+
struct stat sbuf;
|
647
|
+
char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
|
648
|
+
strcat(strcpy(fnidx, fn), ".tbi");
|
649
|
+
if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
|
650
|
+
char *p, *url;
|
651
|
+
int l = strlen(fnidx);
|
652
|
+
for (p = fnidx + l - 1; p >= fnidx; --p)
|
653
|
+
if (*p == '/') break;
|
654
|
+
url = fnidx; fnidx = strdup(p + 1);
|
655
|
+
if (stat(fnidx, &sbuf) == 0) {
|
656
|
+
free(url);
|
657
|
+
return fnidx;
|
658
|
+
}
|
659
|
+
fprintf(stderr, "[%s] downloading the index file...\n", __func__);
|
660
|
+
download_from_remote(url);
|
661
|
+
free(url);
|
662
|
+
}
|
663
|
+
if (stat(fnidx, &sbuf) == 0) return fnidx;
|
664
|
+
free(fnidx); return 0;
|
665
|
+
}
|
666
|
+
|
667
|
+
const char **ti_seqname(const ti_index_t *idx, int *n)
|
668
|
+
{
|
669
|
+
const char **names;
|
670
|
+
khint_t k;
|
671
|
+
*n = idx->n;
|
672
|
+
names = calloc(idx->n, sizeof(void*));
|
673
|
+
for (k = kh_begin(idx->tname); k < kh_end(idx->tname); ++k)
|
674
|
+
if (kh_exist(idx->tname, k))
|
675
|
+
names[kh_val(idx->tname, k)] = kh_key(idx->tname, k);
|
676
|
+
return names;
|
677
|
+
}
|
678
|
+
|
679
|
+
ti_index_t *ti_index_load(const char *fn)
|
680
|
+
{
|
681
|
+
ti_index_t *idx;
|
682
|
+
char *fname = get_local_version(fn);
|
683
|
+
if (fname == 0) return 0;
|
684
|
+
idx = ti_index_load_local(fname);
|
685
|
+
if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load the index: %s\n", fname);
|
686
|
+
free(fname);
|
687
|
+
return idx;
|
688
|
+
}
|
689
|
+
|
690
|
+
int ti_index_build2(const char *fn, const ti_conf_t *conf, const char *_fnidx)
|
691
|
+
{
|
692
|
+
char *fnidx;
|
693
|
+
BGZF *fp, *fpidx;
|
694
|
+
ti_index_t *idx;
|
695
|
+
if ((fp = bgzf_open(fn, "r")) == 0) {
|
696
|
+
fprintf(stderr, "[ti_index_build2] fail to open the file: %s\n", fn);
|
697
|
+
return -1;
|
698
|
+
}
|
699
|
+
idx = ti_index_core(fp, conf);
|
700
|
+
bgzf_close(fp);
|
701
|
+
if (_fnidx == 0) {
|
702
|
+
fnidx = (char*)calloc(strlen(fn) + 5, 1);
|
703
|
+
strcpy(fnidx, fn); strcat(fnidx, ".tbi");
|
704
|
+
} else fnidx = strdup(_fnidx);
|
705
|
+
fpidx = bgzf_open(fnidx, "w");
|
706
|
+
if (fpidx == 0) {
|
707
|
+
fprintf(stderr, "[ti_index_build2] fail to create the index file.\n");
|
708
|
+
free(fnidx);
|
709
|
+
return -1;
|
710
|
+
}
|
711
|
+
ti_index_save(idx, fpidx);
|
712
|
+
ti_index_destroy(idx);
|
713
|
+
bgzf_close(fpidx);
|
714
|
+
free(fnidx);
|
715
|
+
return 0;
|
716
|
+
}
|
717
|
+
|
718
|
+
int ti_index_build(const char *fn, const ti_conf_t *conf)
|
719
|
+
{
|
720
|
+
return ti_index_build2(fn, conf, 0);
|
721
|
+
}
|
722
|
+
|
723
|
+
/********************************************
|
724
|
+
* parse a region in the format chr:beg-end *
|
725
|
+
********************************************/
|
726
|
+
|
727
|
+
int ti_get_tid(const ti_index_t *idx, const char *name)
|
728
|
+
{
|
729
|
+
khiter_t iter;
|
730
|
+
const khash_t(s) *h = idx->tname;
|
731
|
+
iter = kh_get(s, h, name); /* get the tid */
|
732
|
+
if (iter == kh_end(h)) return -1;
|
733
|
+
return kh_value(h, iter);
|
734
|
+
}
|
735
|
+
|
736
|
+
int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end)
|
737
|
+
{
|
738
|
+
char *s, *p;
|
739
|
+
int i, l, k;
|
740
|
+
l = strlen(str);
|
741
|
+
p = s = (char*)malloc(l+1);
|
742
|
+
/* squeeze out "," */
|
743
|
+
for (i = k = 0; i != l; ++i)
|
744
|
+
if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
|
745
|
+
s[k] = 0;
|
746
|
+
for (i = 0; i != k; ++i) if (s[i] == ':') break;
|
747
|
+
s[i] = 0;
|
748
|
+
if ((*tid = ti_get_tid(idx, s)) < 0) {
|
749
|
+
free(s);
|
750
|
+
return -1;
|
751
|
+
}
|
752
|
+
if (i == k) { /* dump the whole sequence */
|
753
|
+
*begin = 0; *end = 1<<29; free(s);
|
754
|
+
return 0;
|
755
|
+
}
|
756
|
+
for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
|
757
|
+
*begin = atoi(p);
|
758
|
+
if (i < k) {
|
759
|
+
p = s + i + 1;
|
760
|
+
*end = atoi(p);
|
761
|
+
} else *end = 1<<29;
|
762
|
+
if (*begin > 0) --*begin;
|
763
|
+
free(s);
|
764
|
+
if (*begin > *end) return -1;
|
765
|
+
return 0;
|
766
|
+
}
|
767
|
+
|
768
|
+
/*******************************
|
769
|
+
* retrieve a specified region *
|
770
|
+
*******************************/
|
771
|
+
|
772
|
+
#define MAX_BIN 37450 // =(8^6-1)/7+1
|
773
|
+
|
774
|
+
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
|
775
|
+
{
|
776
|
+
int i = 0, k;
|
777
|
+
if (beg >= end) return 0;
|
778
|
+
if (end >= 1u<<29) end = 1u<<29;
|
779
|
+
--end;
|
780
|
+
list[i++] = 0;
|
781
|
+
for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
|
782
|
+
for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
|
783
|
+
for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
|
784
|
+
for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
|
785
|
+
for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
|
786
|
+
return i;
|
787
|
+
}
|
788
|
+
|
789
|
+
ti_iter_t ti_iter_first()
|
790
|
+
{
|
791
|
+
ti_iter_t iter;
|
792
|
+
iter = calloc(1, sizeof(struct __ti_iter_t));
|
793
|
+
iter->from_first = 1;
|
794
|
+
return iter;
|
795
|
+
}
|
796
|
+
|
797
|
+
ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end)
|
798
|
+
{
|
799
|
+
uint16_t *bins;
|
800
|
+
int i, n_bins, n_off;
|
801
|
+
pair64_t *off;
|
802
|
+
khint_t k;
|
803
|
+
khash_t(i) *index;
|
804
|
+
uint64_t min_off;
|
805
|
+
ti_iter_t iter = 0;
|
806
|
+
|
807
|
+
if (beg < 0) beg = 0;
|
808
|
+
if (end < beg) return 0;
|
809
|
+
// initialize the iterator
|
810
|
+
iter = calloc(1, sizeof(struct __ti_iter_t));
|
811
|
+
iter->idx = idx; iter->tid = tid; iter->beg = beg; iter->end = end; iter->i = -1;
|
812
|
+
// random access
|
813
|
+
bins = (uint16_t*)calloc(MAX_BIN, 2);
|
814
|
+
n_bins = reg2bins(beg, end, bins);
|
815
|
+
index = idx->index[tid];
|
816
|
+
if (idx->index2[tid].n > 0) {
|
817
|
+
min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
|
818
|
+
: idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
|
819
|
+
if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
|
820
|
+
int n = beg>>TAD_LIDX_SHIFT;
|
821
|
+
if (n > idx->index2[tid].n) n = idx->index2[tid].n;
|
822
|
+
for (i = n - 1; i >= 0; --i)
|
823
|
+
if (idx->index2[tid].offset[i] != 0) break;
|
824
|
+
if (i >= 0) min_off = idx->index2[tid].offset[i];
|
825
|
+
}
|
826
|
+
} else min_off = 0; // tabix 0.1.2 may produce such index files
|
827
|
+
for (i = n_off = 0; i < n_bins; ++i) {
|
828
|
+
if ((k = kh_get(i, index, bins[i])) != kh_end(index))
|
829
|
+
n_off += kh_value(index, k).n;
|
830
|
+
}
|
831
|
+
if (n_off == 0) {
|
832
|
+
free(bins); return iter;
|
833
|
+
}
|
834
|
+
off = (pair64_t*)calloc(n_off, 16);
|
835
|
+
for (i = n_off = 0; i < n_bins; ++i) {
|
836
|
+
if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
|
837
|
+
int j;
|
838
|
+
ti_binlist_t *p = &kh_value(index, k);
|
839
|
+
for (j = 0; j < p->n; ++j)
|
840
|
+
if (p->list[j].v > min_off) off[n_off++] = p->list[j];
|
841
|
+
}
|
842
|
+
}
|
843
|
+
if (n_off == 0) {
|
844
|
+
free(bins); free(off); return iter;
|
845
|
+
}
|
846
|
+
free(bins);
|
847
|
+
{
|
848
|
+
int l;
|
849
|
+
ks_introsort(offt, n_off, off);
|
850
|
+
// resolve completely contained adjacent blocks
|
851
|
+
for (i = 1, l = 0; i < n_off; ++i)
|
852
|
+
if (off[l].v < off[i].v)
|
853
|
+
off[++l] = off[i];
|
854
|
+
n_off = l + 1;
|
855
|
+
// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
|
856
|
+
for (i = 1; i < n_off; ++i)
|
857
|
+
if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
|
858
|
+
{ // merge adjacent blocks
|
859
|
+
for (i = 1, l = 0; i < n_off; ++i) {
|
860
|
+
if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
|
861
|
+
else off[++l] = off[i];
|
862
|
+
}
|
863
|
+
n_off = l + 1;
|
864
|
+
}
|
865
|
+
}
|
866
|
+
iter->n_off = n_off; iter->off = off;
|
867
|
+
return iter;
|
868
|
+
}
|
869
|
+
|
870
|
+
const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len)
|
871
|
+
{
|
872
|
+
if (iter->finished) return 0;
|
873
|
+
if (iter->from_first) {
|
874
|
+
int ret;
|
875
|
+
if ((ret = ti_readline(fp, &iter->str)) < 0) {
|
876
|
+
iter->finished = 1;
|
877
|
+
return 0;
|
878
|
+
} else {
|
879
|
+
if (len) *len = iter->str.l;
|
880
|
+
return iter->str.s;
|
881
|
+
}
|
882
|
+
}
|
883
|
+
if (iter->n_off == 0) return 0;
|
884
|
+
while (1) {
|
885
|
+
int ret;
|
886
|
+
if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
|
887
|
+
if (iter->i == iter->n_off - 1) break; // no more chunks
|
888
|
+
if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
|
889
|
+
if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
|
890
|
+
bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
|
891
|
+
iter->curr_off = bgzf_tell(fp);
|
892
|
+
}
|
893
|
+
++iter->i;
|
894
|
+
}
|
895
|
+
if ((ret = ti_readline(fp, &iter->str)) >= 0) {
|
896
|
+
ti_intv_t intv;
|
897
|
+
iter->curr_off = bgzf_tell(fp);
|
898
|
+
if (iter->str.s[0] == iter->idx->conf.meta_char) continue;
|
899
|
+
get_intv((ti_index_t*)iter->idx, &iter->str, &intv);
|
900
|
+
if (intv.tid != iter->tid || intv.beg >= iter->end) break; // no need to proceed
|
901
|
+
else if (intv.end > iter->beg && iter->end > intv.beg) {
|
902
|
+
if (len) *len = iter->str.l;
|
903
|
+
return iter->str.s;
|
904
|
+
}
|
905
|
+
} else break; // end of file
|
906
|
+
}
|
907
|
+
iter->finished = 1;
|
908
|
+
return 0;
|
909
|
+
}
|
910
|
+
|
911
|
+
void ti_iter_destroy(ti_iter_t iter)
|
912
|
+
{
|
913
|
+
if (iter) {
|
914
|
+
free(iter->str.s); free(iter->off);
|
915
|
+
free(iter);
|
916
|
+
}
|
917
|
+
}
|
918
|
+
|
919
|
+
int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func)
|
920
|
+
{
|
921
|
+
ti_iter_t iter;
|
922
|
+
const char *s;
|
923
|
+
int len;
|
924
|
+
iter = ti_iter_query(idx, tid, beg, end);
|
925
|
+
while ((s = ti_iter_read(fp, iter, &len)) != 0)
|
926
|
+
func(len, s, data);
|
927
|
+
ti_iter_destroy(iter);
|
928
|
+
return 0;
|
929
|
+
}
|
930
|
+
|
931
|
+
const ti_conf_t *ti_get_conf(ti_index_t *idx) { return idx? &idx->conf : 0; }
|
932
|
+
|
933
|
+
/*******************
|
934
|
+
* High-level APIs *
|
935
|
+
*******************/
|
936
|
+
|
937
|
+
tabix_t *ti_open(const char *fn, const char *fnidx)
|
938
|
+
{
|
939
|
+
tabix_t *t;
|
940
|
+
BGZF *fp;
|
941
|
+
if ((fp = bgzf_open(fn, "r")) == 0) return 0;
|
942
|
+
t = calloc(1, sizeof(tabix_t));
|
943
|
+
t->fn = strdup(fn);
|
944
|
+
if (fnidx) t->fnidx = strdup(fnidx);
|
945
|
+
t->fp = fp;
|
946
|
+
return t;
|
947
|
+
}
|
948
|
+
|
949
|
+
void ti_close(tabix_t *t)
|
950
|
+
{
|
951
|
+
if (t) {
|
952
|
+
bgzf_close(t->fp);
|
953
|
+
if (t->idx) ti_index_destroy(t->idx);
|
954
|
+
free(t->fn); free(t->fnidx);
|
955
|
+
free(t);
|
956
|
+
}
|
957
|
+
}
|
958
|
+
|
959
|
+
int ti_lazy_index_load(tabix_t *t)
|
960
|
+
{
|
961
|
+
if (t->idx == 0) { // load index
|
962
|
+
if (t->fnidx) t->idx = ti_index_load_local(t->fnidx);
|
963
|
+
else t->idx = ti_index_load(t->fn);
|
964
|
+
if (t->idx == 0) return -1; // fail to load index
|
965
|
+
}
|
966
|
+
return 0;
|
967
|
+
}
|
968
|
+
|
969
|
+
ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end)
|
970
|
+
{
|
971
|
+
if (tid < 0) return ti_iter_first();
|
972
|
+
if (ti_lazy_index_load(t) != 0) return 0;
|
973
|
+
return ti_iter_query(t->idx, tid, beg, end);
|
974
|
+
}
|
975
|
+
|
976
|
+
ti_iter_t ti_querys(tabix_t *t, const char *reg)
|
977
|
+
{
|
978
|
+
int tid, beg, end;
|
979
|
+
if (reg == 0) return ti_iter_first();
|
980
|
+
if (ti_lazy_index_load(t) != 0) return 0;
|
981
|
+
if (ti_parse_region(t->idx, reg, &tid, &beg, &end) < 0) return 0;
|
982
|
+
return ti_iter_query(t->idx, tid, beg, end);
|
983
|
+
}
|
984
|
+
|
985
|
+
ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end)
|
986
|
+
{
|
987
|
+
int tid;
|
988
|
+
if (name == 0) return ti_iter_first();
|
989
|
+
// then need to load the index
|
990
|
+
if (ti_lazy_index_load(t) != 0) return 0;
|
991
|
+
if ((tid = ti_get_tid(t->idx, name)) < 0) return 0;
|
992
|
+
return ti_iter_query(t->idx, tid, beg, end);
|
993
|
+
}
|
994
|
+
|
995
|
+
const char *ti_read(tabix_t *t, ti_iter_t iter, int *len)
|
996
|
+
{
|
997
|
+
return ti_iter_read(t->fp, iter, len);
|
998
|
+
}
|