ngs_server 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/ext/tabix/NEWS ADDED
@@ -0,0 +1,126 @@
1
+ Release 0.2.4 (10 April, 2011)
2
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3
+
4
+ Notable changes:
5
+
6
+ * Give an error if the index file is older than the data file.
7
+
8
+ * Avoid a segfault given flawed input.
9
+
10
+ * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to
11
+ the dynamic library and are reported to be faster. Pysam also comes with a
12
+ tabix binding.
13
+
14
+ * Added option "-r" for efficient header replacement.
15
+
16
+ * Added BED support.
17
+
18
+ * Synchronized the BGZF library between tabix and samtools.
19
+
20
+ (0.2.4: 10 April 2011, r949)
21
+
22
+
23
+
24
+ Beta Release 0.2.3 (8 December, 2010)
25
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26
+
27
+ Notable changes:
28
+
29
+ * Fixed a minor bug where the first record in a headerless file may be
30
+ missed.
31
+
32
+ * Added an option to print header lines.
33
+
34
+ * Fixed a rare bug which may occasionally happen when retrieving data
35
+ from a region without any records.
36
+
37
+ * Enhanced error reporting.
38
+
39
+ * Fixed a bug in bgzip which may delete the original file even if not
40
+ intended.
41
+
42
+ (0.2.3: 8 December 2010, r876)
43
+
44
+
45
+
46
+ Beta Release 0.2.2 (28 June, 2010)
47
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48
+
49
+ Notable changes:
50
+
51
+ * Dropped the VCF3 support. Added VCF4 support.
52
+
53
+ * Avoided the function name collision with samtools.
54
+
55
+ (0.2.2: 28 June 2010, r603)
56
+
57
+
58
+
59
+ Beta Release 0.2.1 (3 June, 2010)
60
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
61
+
62
+ Notable changes:
63
+
64
+ * Allow shared library to be compiled. Added python binding to the
65
+ shared library.
66
+
67
+ (0.2.1: 3 June 2010, r582)
68
+
69
+
70
+
71
+ Beta Release 0.2.0 (11 May, 2010)
72
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
73
+
74
+ Notable changes:
75
+
76
+ * Fixed an issue for random access given an interval end larger than
77
+ 2^29.
78
+
79
+ * Updated the Java binding.
80
+
81
+ * Added a Perl module using XS.
82
+
83
+ * Improved the C APIs.
84
+
85
+ (0.2.0: 11 May 2010, r574)
86
+
87
+
88
+
89
+ Beta Release 0.1.6 (9 May, 2010)
90
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
91
+
92
+ Notable changes:
93
+
94
+ * Improved backward compatibility. Release 0.1.5 does not work with the
95
+ buggy index file generated by 0.1.2.
96
+
97
+ * Fixed a bug in building linear index. The bug does not affect the
98
+ results, only affects efficiency in rare cases.
99
+
100
+ * Reduced the number of seek calls given an index generated by old
101
+ version of tabix.
102
+
103
+ * Added new APIs for retrieving data via an iterator. The old callback
104
+ APIs are not changed, although internally it uses iterator to
105
+ retrieve data.
106
+
107
+ I am trying to freeze tabix. I just hope I am committing new bugs.
108
+
109
+ (0.1.6: 9 May 2010, r563)
110
+
111
+
112
+
113
+ Beta Release 0.1.5 (5 May, 2010)
114
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
115
+
116
+ Notable changes:
117
+
118
+ * Clarified that tabix is released under MIT/X11.
119
+
120
+ * Improved the robustness of indexing and retrieval.
121
+
122
+ * Reduced the number of seek calls when the specified region starts
123
+ from a 16kb block with no data. The index format is the same, but the
124
+ content is changed a little.
125
+
126
+ (0.1.5: 5 May 2010, r560)
@@ -0,0 +1,395 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2010 Broad Institute.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <hengli@broadinstitute.org> */
27
+
28
+ import net.sf.samtools.util.BlockCompressedInputStream;
29
+
30
+ import java.io.*;
31
+ import java.nio.*;
32
+ import java.util.HashMap;
33
+ import java.util.ArrayList;
34
+ import java.util.Arrays;
35
+ import java.lang.StringBuffer;
36
+
37
+ public class TabixReader
38
+ {
39
+ private String mFn;
40
+ private BlockCompressedInputStream mFp;
41
+
42
+ private int mPreset;
43
+ private int mSc;
44
+ private int mBc;
45
+ private int mEc;
46
+ private int mMeta;
47
+ private int mSkip;
48
+ private String[] mSeq;
49
+
50
+ private HashMap<String, Integer> mChr2tid;
51
+
52
+ private static int MAX_BIN = 37450;
53
+ private static int TAD_MIN_CHUNK_GAP = 32768;
54
+ private static int TAD_LIDX_SHIFT = 14;
55
+
56
+ private class TPair64 implements Comparable<TPair64> {
57
+ long u, v;
58
+ public TPair64(final long _u, final long _v) {
59
+ u = _u; v = _v;
60
+ }
61
+ public TPair64(final TPair64 p) {
62
+ u = p.u; v = p.v;
63
+ }
64
+ public int compareTo(final TPair64 p) {
65
+ return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
66
+ }
67
+ };
68
+
69
+ private class TIndex {
70
+ HashMap<Integer, TPair64[]> b; // binning index
71
+ long[] l; // linear index
72
+ };
73
+ private TIndex[] mIndex;
74
+
75
+ private class TIntv {
76
+ int tid, beg, end;
77
+ };
78
+
79
+ private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
80
+ return (u < v) ^ (u < 0) ^ (v < 0);
81
+ }
82
+
83
+ /**
84
+ * The constructor
85
+ *
86
+ * @param fn File name of the data file
87
+ */
88
+ public TabixReader(final String fn) throws IOException {
89
+ mFn = fn;
90
+ mFp = new BlockCompressedInputStream(new File(fn));
91
+ readIndex();
92
+ }
93
+
94
+ private static int reg2bins(final int beg, final int _end, final int[] list) {
95
+ int i = 0, k, end = _end;
96
+ if (beg >= end) return 0;
97
+ if (end >= 1<<29) end = 1<<29;
98
+ --end;
99
+ list[i++] = 0;
100
+ for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
101
+ for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
102
+ for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
103
+ for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
104
+ for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
105
+ return i;
106
+ }
107
+
108
+ public static int readInt(final InputStream is) throws IOException {
109
+ byte[] buf = new byte[4];
110
+ is.read(buf);
111
+ return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
112
+ }
113
+
114
+ public static long readLong(final InputStream is) throws IOException {
115
+ byte[] buf = new byte[8];
116
+ is.read(buf);
117
+ return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
118
+ }
119
+
120
+ public static String readLine(final InputStream is) throws IOException {
121
+ StringBuffer buf = new StringBuffer();
122
+ int c;
123
+ while ((c = is.read()) >= 0 && c != '\n')
124
+ buf.append((char)c);
125
+ if (c < 0) return null;
126
+ return buf.toString();
127
+ }
128
+
129
+ /**
130
+ * Read the Tabix index from a file
131
+ *
132
+ * @param fp File pointer
133
+ */
134
+ public void readIndex(final File fp) throws IOException {
135
+ if (fp == null) return;
136
+ BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
137
+ byte[] buf = new byte[4];
138
+
139
+ is.read(buf, 0, 4); // read "TBI\1"
140
+ mSeq = new String[readInt(is)]; // # sequences
141
+ mChr2tid = new HashMap<String, Integer>();
142
+ mPreset = readInt(is);
143
+ mSc = readInt(is);
144
+ mBc = readInt(is);
145
+ mEc = readInt(is);
146
+ mMeta = readInt(is);
147
+ mSkip = readInt(is);
148
+ // read sequence dictionary
149
+ int i, j, k, l = readInt(is);
150
+ buf = new byte[l];
151
+ is.read(buf);
152
+ for (i = j = k = 0; i < buf.length; ++i) {
153
+ if (buf[i] == 0) {
154
+ byte[] b = new byte[i - j];
155
+ System.arraycopy(buf, j, b, 0, b.length);
156
+ String s = new String(b);
157
+ mChr2tid.put(s, k);
158
+ mSeq[k++] = s;
159
+ j = i + 1;
160
+ }
161
+ }
162
+ // read the index
163
+ mIndex = new TIndex[mSeq.length];
164
+ for (i = 0; i < mSeq.length; ++i) {
165
+ // the binning index
166
+ int n_bin = readInt(is);
167
+ mIndex[i] = new TIndex();
168
+ mIndex[i].b = new HashMap<Integer, TPair64[]>();
169
+ for (j = 0; j < n_bin; ++j) {
170
+ int bin = readInt(is);
171
+ TPair64[] chunks = new TPair64[readInt(is)];
172
+ for (k = 0; k < chunks.length; ++k) {
173
+ long u = readLong(is);
174
+ long v = readLong(is);
175
+ chunks[k] = new TPair64(u, v); // in C, this is inefficient
176
+ }
177
+ mIndex[i].b.put(bin, chunks);
178
+ }
179
+ // the linear index
180
+ mIndex[i].l = new long[readInt(is)];
181
+ for (k = 0; k < mIndex[i].l.length; ++k)
182
+ mIndex[i].l[k] = readLong(is);
183
+ }
184
+ // close
185
+ is.close();
186
+ }
187
+
188
+ /**
189
+ * Read the Tabix index from the default file.
190
+ */
191
+ public void readIndex() throws IOException {
192
+ readIndex(new File(mFn + ".tbi"));
193
+ }
194
+
195
+ /**
196
+ * Read one line from the data file.
197
+ */
198
+ public String readLine() throws IOException {
199
+ return readLine(mFp);
200
+ }
201
+
202
+ private int chr2tid(final String chr) {
203
+ if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
204
+ else return -1;
205
+ }
206
+
207
+ /**
208
+ * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
209
+ *
210
+ * @param reg Region string
211
+ * @return An array where the three elements are sequence_id,
212
+ * region_begin and region_end. On failure, sequence_id==-1.
213
+ */
214
+ public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
215
+ String chr;
216
+ int colon, hyphen;
217
+ int[] ret = new int[3];
218
+ colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
219
+ chr = colon >= 0? reg.substring(0, colon) : reg;
220
+ ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen >= 0? hyphen : reg.length())) - 1 : 0;
221
+ ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
222
+ ret[0] = chr2tid(chr);
223
+ return ret;
224
+ }
225
+
226
+ private TIntv getIntv(final String s) {
227
+ TIntv intv = new TIntv();
228
+ int col = 0, end = 0, beg = 0;
229
+ while ((end = s.indexOf('\t', beg)) >= 0 || end == -1) {
230
+ ++col;
231
+ if (col == mSc) {
232
+ intv.tid = chr2tid(s.substring(beg, end));
233
+ } else if (col == mBc) {
234
+ intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
235
+ if ((mPreset&0x10000) != 0) ++intv.end;
236
+ else --intv.beg;
237
+ if (intv.beg < 0) intv.beg = 0;
238
+ if (intv.end < 1) intv.end = 1;
239
+ } else { // FIXME: SAM supports are not tested yet
240
+ if ((mPreset&0xffff) == 0) { // generic
241
+ if (col == mEc)
242
+ intv.end = Integer.parseInt(s.substring(beg, end));
243
+ } else if ((mPreset&0xffff) == 1) { // SAM
244
+ if (col == 6) { // CIGAR
245
+ int l = 0, i, j;
246
+ String cigar = s.substring(beg, end);
247
+ for (i = j = 0; i < cigar.length(); ++i) {
248
+ if (cigar.charAt(i) > '9') {
249
+ int op = cigar.charAt(i);
250
+ if (op == 'M' || op == 'D' || op == 'N')
251
+ l += Integer.parseInt(cigar.substring(j, i));
252
+ }
253
+ }
254
+ intv.end = intv.beg + l;
255
+ }
256
+ } else if ((mPreset&0xffff) == 2) { // VCF
257
+ String alt;
258
+ alt = end >= 0? s.substring(beg, end) : s.substring(beg);
259
+ if (col == 4) { // REF
260
+ if (alt.length() > 0) intv.end = intv.beg + alt.length();
261
+ } else if (col == 8) { // INFO
262
+ int e_off = -1, i = alt.indexOf("END=");
263
+ if (i == 0) e_off = 4;
264
+ else if (i > 0) {
265
+ i = alt.indexOf(";END=");
266
+ if (i >= 0) e_off = i + 5;
267
+ }
268
+ if (e_off > 0) {
269
+ i = alt.indexOf(";", e_off);
270
+ intv.end = Integer.parseInt(i > e_off? alt.substring(e_off, i) : alt.substring(e_off));
271
+ }
272
+ }
273
+ }
274
+ }
275
+ if (end == -1) break;
276
+ beg = end + 1;
277
+ }
278
+ return intv;
279
+ }
280
+
281
+ public class Iterator {
282
+ private int i, n_seeks;
283
+ private int tid, beg, end;
284
+ private TPair64[] off;
285
+ private long curr_off;
286
+ private boolean iseof;
287
+
288
+ public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
289
+ i = -1; n_seeks = 0; curr_off = 0; iseof = false;
290
+ off = _off; tid = _tid; beg = _beg; end = _end;
291
+ }
292
+
293
+ public String next() throws IOException {
294
+ if (iseof) return null;
295
+ for (;;) {
296
+ if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
297
+ if (i == off.length - 1) break; // no more chunks
298
+ if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
299
+ if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
300
+ mFp.seek(off[i+1].u);
301
+ curr_off = mFp.getFilePointer();
302
+ ++n_seeks;
303
+ }
304
+ ++i;
305
+ }
306
+ String s;
307
+ if ((s = readLine(mFp)) != null) {
308
+ TIntv intv;
309
+ char[] str = s.toCharArray();
310
+ curr_off = mFp.getFilePointer();
311
+ if (str.length == 0 || str[0] == mMeta) continue;
312
+ intv = getIntv(s);
313
+ if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
314
+ else if (intv.end > beg && intv.beg < end) return s; // overlap; return
315
+ } else break; // end of file
316
+ }
317
+ iseof = true;
318
+ return null;
319
+ }
320
+ };
321
+
322
+ public Iterator query(final int tid, final int beg, final int end) {
323
+ TPair64[] off, chunks;
324
+ long min_off;
325
+ TIndex idx = mIndex[tid];
326
+ int[] bins = new int[MAX_BIN];
327
+ int i, l, n_off, n_bins = reg2bins(beg, end, bins);
328
+ if (idx.l.length > 0)
329
+ min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
330
+ else min_off = 0;
331
+ for (i = n_off = 0; i < n_bins; ++i) {
332
+ if ((chunks = idx.b.get(bins[i])) != null)
333
+ n_off += chunks.length;
334
+ }
335
+ if (n_off == 0) return null;
336
+ off = new TPair64[n_off];
337
+ for (i = n_off = 0; i < n_bins; ++i)
338
+ if ((chunks = idx.b.get(bins[i])) != null)
339
+ for (int j = 0; j < chunks.length; ++j)
340
+ if (less64(min_off, chunks[j].v))
341
+ off[n_off++] = new TPair64(chunks[j]);
342
+ if (n_off == 0) return null;
343
+ Arrays.sort(off, 0, n_off);
344
+ // resolve completely contained adjacent blocks
345
+ for (i = 1, l = 0; i < n_off; ++i) {
346
+ if (less64(off[l].v, off[i].v)) {
347
+ ++l;
348
+ off[l].u = off[i].u; off[l].v = off[i].v;
349
+ }
350
+ }
351
+ n_off = l + 1;
352
+ // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
353
+ for (i = 1; i < n_off; ++i)
354
+ if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
355
+ // merge adjacent blocks
356
+ for (i = 1, l = 0; i < n_off; ++i) {
357
+ if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
358
+ else {
359
+ ++l;
360
+ off[l].u = off[i].u;
361
+ off[l].v = off[i].v;
362
+ }
363
+ }
364
+ n_off = l + 1;
365
+ // return
366
+ TPair64[] ret = new TPair64[n_off];
367
+ for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
368
+ return new TabixReader.Iterator(tid, beg, end, ret);
369
+ }
370
+
371
+ public Iterator query(final String reg) {
372
+ int[] x = parseReg(reg);
373
+ return query(x[0], x[1], x[2]);
374
+ }
375
+
376
+ public static void main(String[] args) {
377
+ if (args.length < 1) {
378
+ System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
379
+ System.exit(1);
380
+ }
381
+ try {
382
+ TabixReader tr = new TabixReader(args[0]);
383
+ String s;
384
+ if (args.length == 1) { // no region is specified; print the whole file
385
+ while ((s = tr.readLine()) != null)
386
+ System.out.println(s);
387
+ } else { // a region is specified; random access
388
+ TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
389
+ while (iter != null && (s = iter.next()) != null)
390
+ System.out.println(s);
391
+ }
392
+ } catch (IOException e) {
393
+ }
394
+ }
395
+ }
@@ -0,0 +1,42 @@
1
+ #ifndef BAM_ENDIAN_H
2
+ #define BAM_ENDIAN_H
3
+
4
+ #include <stdint.h>
5
+
6
+ static inline int bam_is_big_endian()
7
+ {
8
+ long one= 1;
9
+ return !(*((char *)(&one)));
10
+ }
11
+ static inline uint16_t bam_swap_endian_2(uint16_t v)
12
+ {
13
+ return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
14
+ }
15
+ static inline void *bam_swap_endian_2p(void *x)
16
+ {
17
+ *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
18
+ return x;
19
+ }
20
+ static inline uint32_t bam_swap_endian_4(uint32_t v)
21
+ {
22
+ v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
23
+ return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
24
+ }
25
+ static inline void *bam_swap_endian_4p(void *x)
26
+ {
27
+ *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
28
+ return x;
29
+ }
30
+ static inline uint64_t bam_swap_endian_8(uint64_t v)
31
+ {
32
+ v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
33
+ v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
34
+ return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
35
+ }
36
+ static inline void *bam_swap_endian_8p(void *x)
37
+ {
38
+ *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
39
+ return x;
40
+ }
41
+
42
+ #endif
@@ -0,0 +1,156 @@
1
+ #include <stdlib.h>
2
+ #include <stdint.h>
3
+ #include <string.h>
4
+ #include <stdio.h>
5
+ #include <zlib.h>
6
+
7
+ #include "ksort.h"
8
+ KSORT_INIT_GENERIC(uint64_t)
9
+
10
+ #include "kseq.h"
11
+ KSTREAM_INIT(gzFile, gzread, 8192)
12
+
13
+ typedef struct {
14
+ int n, m;
15
+ uint64_t *a;
16
+ int *idx;
17
+ } bed_reglist_t;
18
+
19
+ #include "khash.h"
20
+ KHASH_MAP_INIT_STR(reg, bed_reglist_t)
21
+
22
+ #define LIDX_SHIFT 13
23
+
24
+ typedef kh_reg_t reghash_t;
25
+
26
+ int *bed_index_core(int n, uint64_t *a, int *n_idx)
27
+ {
28
+ int i, j, m, *idx;
29
+ m = *n_idx = 0; idx = 0;
30
+ for (i = 0; i < n; ++i) {
31
+ int beg, end;
32
+ beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
33
+ if (m < end + 1) {
34
+ int oldm = m;
35
+ m = end + 1;
36
+ kroundup32(m);
37
+ idx = realloc(idx, m * sizeof(int));
38
+ for (j = oldm; j < m; ++j) idx[j] = -1;
39
+ }
40
+ if (beg == end) {
41
+ if (idx[beg] < 0) idx[beg] = i;
42
+ } else {
43
+ for (j = beg; j <= end; ++j)
44
+ if (idx[j] < 0) idx[j] = i;
45
+ }
46
+ *n_idx = end + 1;
47
+ }
48
+ return idx;
49
+ }
50
+
51
+ void bed_index(void *_h)
52
+ {
53
+ reghash_t *h = (reghash_t*)_h;
54
+ khint_t k;
55
+ for (k = 0; k < kh_end(h); ++k) {
56
+ if (kh_exist(h, k)) {
57
+ bed_reglist_t *p = &kh_val(h, k);
58
+ if (p->idx) free(p->idx);
59
+ ks_introsort(uint64_t, p->n, p->a);
60
+ p->idx = bed_index_core(p->n, p->a, &p->m);
61
+ }
62
+ }
63
+ }
64
+
65
+ int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
66
+ {
67
+ int i, min_off;
68
+ if (p->n == 0) return 0;
69
+ min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
70
+ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
71
+ int n = beg>>LIDX_SHIFT;
72
+ if (n > p->n) n = p->n;
73
+ for (i = n - 1; i >= 0; --i)
74
+ if (p->idx[i] >= 0) break;
75
+ min_off = i >= 0? p->idx[i] : 0;
76
+ }
77
+ for (i = min_off; i < p->n; ++i) {
78
+ if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
79
+ if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
80
+ return 1; // find the overlap; return
81
+ }
82
+ return 0;
83
+ }
84
+
85
+ int bed_overlap(const void *_h, const char *chr, int beg, int end)
86
+ {
87
+ const reghash_t *h = (const reghash_t*)_h;
88
+ khint_t k;
89
+ if (!h) return 0;
90
+ k = kh_get(reg, h, chr);
91
+ if (k == kh_end(h)) return 0;
92
+ return bed_overlap_core(&kh_val(h, k), beg, end);
93
+ }
94
+
95
+ void *bed_read(const char *fn)
96
+ {
97
+ reghash_t *h = kh_init(reg);
98
+ gzFile fp;
99
+ kstream_t *ks;
100
+ int dret;
101
+ kstring_t *str;
102
+ // read the list
103
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
104
+ if (fp == 0) return 0;
105
+ str = calloc(1, sizeof(kstring_t));
106
+ ks = ks_init(fp);
107
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
108
+ int beg = -1, end = -1;
109
+ bed_reglist_t *p;
110
+ khint_t k = kh_get(reg, h, str->s);
111
+ if (k == kh_end(h)) { // absent from the hash table
112
+ int ret;
113
+ char *s = strdup(str->s);
114
+ k = kh_put(reg, h, s, &ret);
115
+ memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
116
+ }
117
+ p = &kh_val(h, k);
118
+ if (dret != '\n') { // if the lines has other characters
119
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
120
+ beg = atoi(str->s); // begin
121
+ if (dret != '\n') {
122
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
123
+ end = atoi(str->s); // end
124
+ }
125
+ }
126
+ }
127
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
128
+ if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
129
+ if (beg >= 0 && end > beg) {
130
+ if (p->n == p->m) {
131
+ p->m = p->m? p->m<<1 : 4;
132
+ p->a = realloc(p->a, p->m * 8);
133
+ }
134
+ p->a[p->n++] = (uint64_t)beg<<32 | end;
135
+ }
136
+ }
137
+ ks_destroy(ks);
138
+ gzclose(fp);
139
+ free(str->s); free(str);
140
+ bed_index(h);
141
+ return h;
142
+ }
143
+
144
+ void bed_destroy(void *_h)
145
+ {
146
+ reghash_t *h = (reghash_t*)_h;
147
+ khint_t k;
148
+ for (k = 0; k < kh_end(h); ++k) {
149
+ if (kh_exist(h, k)) {
150
+ free(kh_val(h, k).a);
151
+ free(kh_val(h, k).idx);
152
+ free((char*)kh_key(h, k));
153
+ }
154
+ }
155
+ kh_destroy(reg, h);
156
+ }