ngs_server 0.4 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/tabix/ChangeLog +593 -0
- data/ext/tabix/Makefile +65 -0
- data/ext/tabix/NEWS +126 -0
- data/ext/tabix/TabixReader.java +395 -0
- data/ext/tabix/bam_endian.h +42 -0
- data/ext/tabix/bedidx.c +156 -0
- data/ext/tabix/bgzf.c +714 -0
- data/ext/tabix/bgzf.h +157 -0
- data/ext/tabix/bgzip.c +206 -0
- data/ext/tabix/example.gtf.gz +0 -0
- data/ext/tabix/example.gtf.gz.tbi +0 -0
- data/ext/tabix/extconf.rb +1 -0
- data/ext/tabix/index.c +998 -0
- data/ext/tabix/khash.h +486 -0
- data/ext/tabix/knetfile.c +632 -0
- data/ext/tabix/knetfile.h +75 -0
- data/ext/tabix/kseq.h +227 -0
- data/ext/tabix/ksort.h +271 -0
- data/ext/tabix/kstring.c +165 -0
- data/ext/tabix/kstring.h +68 -0
- data/ext/tabix/main.c +290 -0
- data/ext/tabix/perl/MANIFEST +8 -0
- data/ext/tabix/perl/Makefile.PL +8 -0
- data/ext/tabix/perl/Tabix.pm +76 -0
- data/ext/tabix/perl/Tabix.xs +71 -0
- data/ext/tabix/perl/TabixIterator.pm +41 -0
- data/ext/tabix/perl/t/01local.t +28 -0
- data/ext/tabix/perl/t/02remote.t +28 -0
- data/ext/tabix/perl/typemap +3 -0
- data/ext/tabix/python/setup.py +55 -0
- data/ext/tabix/python/tabixmodule.c +408 -0
- data/ext/tabix/python/test.py +91 -0
- data/ext/tabix/tabix.1 +132 -0
- data/ext/tabix/tabix.h +145 -0
- data/ext/tabix/tabix.py +87 -0
- data/ext/tabix/tabix.tex +121 -0
- data/ext/vcftools/perl/Vcf.pm +5 -3
- data/ext/vcftools/perl/vcf-query +2 -0
- data/lib/ngs_server/version.rb +1 -1
- data/lib/ngs_server.rb +12 -11
- data/ngs_server.gemspec +1 -2
- metadata +39 -2
@@ -0,0 +1,408 @@
|
|
1
|
+
/*-
|
2
|
+
* The MIT License
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 Seoul National University.
|
5
|
+
*
|
6
|
+
* Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
* a copy of this software and associated documentation files (the
|
8
|
+
* "Software"), to deal in the Software without restriction, including
|
9
|
+
* without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
* distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
* permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
* the following conditions:
|
13
|
+
*
|
14
|
+
* The above copyright notice and this permission notice shall be
|
15
|
+
* included in all copies or substantial portions of the Software.
|
16
|
+
*
|
17
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
21
|
+
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
22
|
+
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
23
|
+
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
24
|
+
* SOFTWARE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
/*
|
28
|
+
* Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
|
29
|
+
*/
|
30
|
+
|
31
|
+
#define PY_SSIZE_T_CLEAN
|
32
|
+
#include "Python.h"
|
33
|
+
#include "tabix.h"
|
34
|
+
|
35
|
+
static PyObject *TabixError;
|
36
|
+
|
37
|
+
typedef struct {
|
38
|
+
PyObject_HEAD
|
39
|
+
tabix_t *tb;
|
40
|
+
char *fn;
|
41
|
+
} TabixObject;
|
42
|
+
|
43
|
+
typedef struct {
|
44
|
+
PyObject_HEAD
|
45
|
+
TabixObject *tbobj;
|
46
|
+
ti_iter_t iter;
|
47
|
+
} TabixIteratorObject;
|
48
|
+
|
49
|
+
static PyTypeObject Tabix_Type, TabixIterator_Type;
|
50
|
+
|
51
|
+
/* --- TabixIterator --------------------------------------------------- */
|
52
|
+
|
53
|
+
static PyObject *
|
54
|
+
tabixiter_create(TabixObject *parentidx, ti_iter_t iter)
|
55
|
+
{
|
56
|
+
TabixIteratorObject *self;
|
57
|
+
|
58
|
+
self = PyObject_New(TabixIteratorObject, &TabixIterator_Type);
|
59
|
+
if (self == NULL)
|
60
|
+
return NULL;
|
61
|
+
|
62
|
+
Py_INCREF(parentidx);
|
63
|
+
self->tbobj = parentidx;
|
64
|
+
self->iter = iter;
|
65
|
+
|
66
|
+
return (PyObject *)self;
|
67
|
+
}
|
68
|
+
|
69
|
+
static void
|
70
|
+
tabixiter_dealloc(TabixIteratorObject *self)
|
71
|
+
{
|
72
|
+
ti_iter_destroy(self->iter);
|
73
|
+
Py_DECREF(self->tbobj);
|
74
|
+
PyObject_Del(self);
|
75
|
+
}
|
76
|
+
|
77
|
+
/* tp_iter for TabixIterator: the object is its own iterator. */
static PyObject *
tabixiter_iter(PyObject *self)
{
    PyObject *same = self;

    Py_INCREF(same);    /* the caller receives a new reference */
    return same;
}
|
83
|
+
|
84
|
+
/* Constructor used for column values: PyString on Python 2, PyUnicode on 3. */
#if PY_MAJOR_VERSION >= 3
#   define PYOBJECT_FROM_STRING_AND_SIZE PyUnicode_FromStringAndSize
#else
#   define PYOBJECT_FROM_STRING_AND_SIZE PyString_FromStringAndSize
#endif
|
89
|
+
|
90
|
+
static PyObject *
|
91
|
+
tabixiter_iternext(TabixIteratorObject *self)
|
92
|
+
{
|
93
|
+
const char *chunk;
|
94
|
+
int len, i;
|
95
|
+
|
96
|
+
chunk = ti_read(self->tbobj->tb, self->iter, &len);
|
97
|
+
if (chunk != NULL) {
|
98
|
+
PyObject *ret, *column;
|
99
|
+
Py_ssize_t colidx;
|
100
|
+
const char *ptr, *begin;
|
101
|
+
|
102
|
+
ret = PyList_New(0);
|
103
|
+
if (ret == NULL)
|
104
|
+
return NULL;
|
105
|
+
|
106
|
+
colidx = 0;
|
107
|
+
ptr = begin = chunk;
|
108
|
+
for (i = len; i > 0; i--, ptr++)
|
109
|
+
if (*ptr == '\t') {
|
110
|
+
column = PYOBJECT_FROM_STRING_AND_SIZE(begin,
|
111
|
+
(Py_ssize_t)(ptr - begin));
|
112
|
+
if (column == NULL || PyList_Append(ret, column) == -1) {
|
113
|
+
Py_DECREF(ret);
|
114
|
+
return NULL;
|
115
|
+
}
|
116
|
+
|
117
|
+
Py_DECREF(column);
|
118
|
+
begin = ptr + 1;
|
119
|
+
colidx++;
|
120
|
+
}
|
121
|
+
|
122
|
+
column = PYOBJECT_FROM_STRING_AND_SIZE(begin, (Py_ssize_t)(ptr - begin));
|
123
|
+
if (column == NULL || PyList_Append(ret, column) == -1) {
|
124
|
+
Py_DECREF(ret);
|
125
|
+
return NULL;
|
126
|
+
}
|
127
|
+
Py_DECREF(column);
|
128
|
+
|
129
|
+
return ret;
|
130
|
+
}
|
131
|
+
else
|
132
|
+
return NULL;
|
133
|
+
}
|
134
|
+
|
135
|
+
static PyMethodDef tabixiter_methods[] = {
|
136
|
+
{NULL, NULL} /* sentinel */
|
137
|
+
};
|
138
|
+
|
139
|
+
static PyTypeObject TabixIterator_Type = {
|
140
|
+
PyVarObject_HEAD_INIT(NULL, 0)
|
141
|
+
"tabix.TabixIterator", /*tp_name*/
|
142
|
+
sizeof(TabixIteratorObject), /*tp_basicsize*/
|
143
|
+
0, /*tp_itemsize*/
|
144
|
+
/* methods */
|
145
|
+
(destructor)tabixiter_dealloc, /*tp_dealloc*/
|
146
|
+
0, /*tp_print*/
|
147
|
+
0, /*tp_getattr*/
|
148
|
+
0, /*tp_setattr*/
|
149
|
+
0, /*tp_compare*/
|
150
|
+
0, /*tp_repr*/
|
151
|
+
0, /*tp_as_number*/
|
152
|
+
0, /*tp_as_sequence*/
|
153
|
+
0, /*tp_as_mapping*/
|
154
|
+
0, /*tp_hash*/
|
155
|
+
0, /*tp_call*/
|
156
|
+
0, /*tp_str*/
|
157
|
+
0, /*tp_getattro*/
|
158
|
+
0, /*tp_setattro*/
|
159
|
+
0, /*tp_as_buffer*/
|
160
|
+
Py_TPFLAGS_DEFAULT, /*tp_flags*/
|
161
|
+
0, /*tp_doc*/
|
162
|
+
0, /*tp_traverse*/
|
163
|
+
0, /*tp_clear*/
|
164
|
+
0, /*tp_richcompare*/
|
165
|
+
0, /*tp_weaklistoffset*/
|
166
|
+
tabixiter_iter, /*tp_iter*/
|
167
|
+
(iternextfunc)tabixiter_iternext, /*tp_iternext*/
|
168
|
+
tabixiter_methods, /*tp_methods*/
|
169
|
+
0, /*tp_members*/
|
170
|
+
0, /*tp_getset*/
|
171
|
+
0, /*tp_base*/
|
172
|
+
0, /*tp_dict*/
|
173
|
+
0, /*tp_descr_get*/
|
174
|
+
0, /*tp_descr_set*/
|
175
|
+
0, /*tp_dictoffset*/
|
176
|
+
0, /*tp_init*/
|
177
|
+
0, /*tp_alloc*/
|
178
|
+
0, /*tp_new*/
|
179
|
+
0, /*tp_free*/
|
180
|
+
0, /*tp_is_gc*/
|
181
|
+
};
|
182
|
+
|
183
|
+
|
184
|
+
/* --- Tabix ----------------------------------------------------------- */
|
185
|
+
|
186
|
+
static PyObject *
|
187
|
+
tabix_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
188
|
+
{
|
189
|
+
TabixObject *self;
|
190
|
+
const char *fn, *fnidx=NULL;
|
191
|
+
static char *kwnames[]={"fn", "fnidx", NULL};
|
192
|
+
tabix_t *tb;
|
193
|
+
|
194
|
+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|z:Tabix",
|
195
|
+
kwnames, &fn, &fnidx))
|
196
|
+
return NULL;
|
197
|
+
|
198
|
+
tb = ti_open(fn, fnidx);
|
199
|
+
if (tb == NULL) {
|
200
|
+
PyErr_SetString(TabixError, "Can't open the index file.");
|
201
|
+
return NULL;
|
202
|
+
}
|
203
|
+
|
204
|
+
self = (TabixObject *)type->tp_alloc(type, 0);
|
205
|
+
if (self == NULL)
|
206
|
+
return NULL;
|
207
|
+
|
208
|
+
self->tb = tb;
|
209
|
+
self->fn = strdup(fn);
|
210
|
+
|
211
|
+
return (PyObject *)self;
|
212
|
+
}
|
213
|
+
|
214
|
+
static void
|
215
|
+
tabix_dealloc(TabixObject *self)
|
216
|
+
{
|
217
|
+
free(self->fn);
|
218
|
+
ti_close(self->tb);
|
219
|
+
PyObject_Del(self);
|
220
|
+
}
|
221
|
+
|
222
|
+
static PyObject *
|
223
|
+
tabix_query(TabixObject *self, PyObject *args)
|
224
|
+
{
|
225
|
+
char *name;
|
226
|
+
int begin, end;
|
227
|
+
ti_iter_t result;
|
228
|
+
|
229
|
+
if (!PyArg_ParseTuple(args, "sii:query", &name, &begin, &end))
|
230
|
+
return NULL;
|
231
|
+
|
232
|
+
result = ti_query(self->tb, name, begin, end);
|
233
|
+
if (result == NULL) {
|
234
|
+
PyErr_SetString(TabixError, "query failed");
|
235
|
+
return NULL;
|
236
|
+
}
|
237
|
+
|
238
|
+
return tabixiter_create(self, result);
|
239
|
+
}
|
240
|
+
|
241
|
+
static PyObject *
|
242
|
+
tabix_queryi(TabixObject *self, PyObject *args)
|
243
|
+
{
|
244
|
+
int tid, begin, end;
|
245
|
+
ti_iter_t result;
|
246
|
+
|
247
|
+
if (!PyArg_ParseTuple(args, "iii:queryi", &tid, &begin, &end))
|
248
|
+
return NULL;
|
249
|
+
|
250
|
+
result = ti_queryi(self->tb, tid, begin, end);
|
251
|
+
if (result == NULL) {
|
252
|
+
PyErr_SetString(TabixError, "query failed");
|
253
|
+
return NULL;
|
254
|
+
}
|
255
|
+
|
256
|
+
return tabixiter_create(self, result);
|
257
|
+
}
|
258
|
+
|
259
|
+
static PyObject *
|
260
|
+
tabix_querys(TabixObject *self, PyObject *args)
|
261
|
+
{
|
262
|
+
const char *reg;
|
263
|
+
ti_iter_t result;
|
264
|
+
|
265
|
+
if (!PyArg_ParseTuple(args, "s:querys", ®))
|
266
|
+
return NULL;
|
267
|
+
|
268
|
+
result = ti_querys(self->tb, reg);
|
269
|
+
if (result == NULL) {
|
270
|
+
PyErr_SetString(TabixError, "query failed");
|
271
|
+
return NULL;
|
272
|
+
}
|
273
|
+
|
274
|
+
return tabixiter_create(self, result);
|
275
|
+
}
|
276
|
+
|
277
|
+
static PyObject *
|
278
|
+
tabix_repr(TabixObject *self)
|
279
|
+
{
|
280
|
+
#if PY_MAJOR_VERSION < 3
|
281
|
+
return PyString_FromFormat("<Tabix fn=\"%s\">", self->fn);
|
282
|
+
#else
|
283
|
+
return PyUnicode_FromFormat("<Tabix fn=\"%s\">", self->fn);
|
284
|
+
#endif
|
285
|
+
}
|
286
|
+
|
287
|
+
static PyMethodDef tabix_methods[] = {
|
288
|
+
{"query", (PyCFunction)tabix_query, METH_VARARGS,
|
289
|
+
PyDoc_STR("T.query(name, begin, end) -> iterator")},
|
290
|
+
{"queryi", (PyCFunction)tabix_queryi, METH_VARARGS,
|
291
|
+
PyDoc_STR("T.queryi(tid, begin, id) -> iterator")},
|
292
|
+
{"querys", (PyCFunction)tabix_querys, METH_VARARGS,
|
293
|
+
PyDoc_STR("T.querys(region) -> iterator")},
|
294
|
+
{NULL, NULL} /* sentinel */
|
295
|
+
};
|
296
|
+
|
297
|
+
static PyTypeObject Tabix_Type = {
|
298
|
+
/* The ob_type field must be initialized in the module init function
|
299
|
+
* to be portable to Windows without using C++. */
|
300
|
+
PyVarObject_HEAD_INIT(NULL, 0)
|
301
|
+
"tabix.Tabix", /*tp_name*/
|
302
|
+
sizeof(TabixObject), /*tp_basicsize*/
|
303
|
+
0, /*tp_itemsize*/
|
304
|
+
/* methods */
|
305
|
+
(destructor)tabix_dealloc, /*tp_dealloc*/
|
306
|
+
0, /*tp_print*/
|
307
|
+
0, /*tp_getattr*/
|
308
|
+
0, /*tp_setattr*/
|
309
|
+
0, /*tp_compare*/
|
310
|
+
(reprfunc)tabix_repr, /*tp_repr*/
|
311
|
+
0, /*tp_as_number*/
|
312
|
+
0, /*tp_as_sequence*/
|
313
|
+
0, /*tp_as_mapping*/
|
314
|
+
0, /*tp_hash*/
|
315
|
+
0, /*tp_call*/
|
316
|
+
0, /*tp_str*/
|
317
|
+
0, /*tp_getattro*/
|
318
|
+
0, /*tp_setattro*/
|
319
|
+
0, /*tp_as_buffer*/
|
320
|
+
Py_TPFLAGS_DEFAULT, /*tp_flags*/
|
321
|
+
0, /*tp_doc*/
|
322
|
+
0, /*tp_traverse*/
|
323
|
+
0, /*tp_clear*/
|
324
|
+
0, /*tp_richcompare*/
|
325
|
+
0, /*tp_weaklistoffset*/
|
326
|
+
0, /*tp_iter*/
|
327
|
+
0, /*tp_iternext*/
|
328
|
+
tabix_methods, /*tp_methods*/
|
329
|
+
0, /*tp_members*/
|
330
|
+
0, /*tp_getset*/
|
331
|
+
0, /*tp_base*/
|
332
|
+
0, /*tp_dict*/
|
333
|
+
0, /*tp_descr_get*/
|
334
|
+
0, /*tp_descr_set*/
|
335
|
+
0, /*tp_dictoffset*/
|
336
|
+
0, /*tp_init*/
|
337
|
+
0, /*tp_alloc*/
|
338
|
+
(newfunc)tabix_new, /*tp_new*/
|
339
|
+
0, /*tp_free*/
|
340
|
+
0, /*tp_is_gc*/
|
341
|
+
};
|
342
|
+
/* --------------------------------------------------------------------- */
|
343
|
+
|
344
|
+
static PyMethodDef tabix_functions[] = {
|
345
|
+
{NULL, NULL} /* sentinel */
|
346
|
+
};
|
347
|
+
|
348
|
+
PyDoc_STRVAR(module_doc,
|
349
|
+
"Python interface to tabix, Heng Li's generic indexer for TAB-delimited "
|
350
|
+
"genome position filesThis is a template module just for instruction.");
|
351
|
+
|
352
|
+
#if PY_MAJOR_VERSION >= 3
/* Python 3 module definition handed to PyModule_Create(). */
static struct PyModuleDef tabixmodule = {
    PyModuleDef_HEAD_INIT,
    "tabix",            /* m_name */
    module_doc,         /* m_doc */
    -1,                 /* m_size */
    tabix_functions,    /* m_methods */
    NULL,               /* remaining members are unused */
    NULL,
    NULL,
    NULL
};
#endif
|
365
|
+
|
366
|
+
#if PY_MAJOR_VERSION < 3
|
367
|
+
PyMODINIT_FUNC inittabix(void)
|
368
|
+
#else
|
369
|
+
PyMODINIT_FUNC PyInit_tabix(void)
|
370
|
+
#endif
|
371
|
+
{
|
372
|
+
PyObject *m;
|
373
|
+
|
374
|
+
if (PyType_Ready(&Tabix_Type) < 0)
|
375
|
+
goto fail;
|
376
|
+
if (PyType_Ready(&TabixIterator_Type) < 0)
|
377
|
+
goto fail;
|
378
|
+
|
379
|
+
#if PY_MAJOR_VERSION < 3
|
380
|
+
m = Py_InitModule3("tabix", tabix_functions, module_doc);
|
381
|
+
#else
|
382
|
+
m = PyModule_Create(&tabixmodule);
|
383
|
+
#endif
|
384
|
+
if (m == NULL)
|
385
|
+
goto fail;
|
386
|
+
|
387
|
+
if (TabixError == NULL) {
|
388
|
+
TabixError = PyErr_NewException("tabix.error", NULL, NULL);
|
389
|
+
if (TabixError == NULL)
|
390
|
+
goto fail;
|
391
|
+
}
|
392
|
+
Py_INCREF(TabixError);
|
393
|
+
PyModule_AddObject(m, "error", TabixError);
|
394
|
+
|
395
|
+
PyModule_AddObject(m, "Tabix", (PyObject *)&Tabix_Type);
|
396
|
+
PyModule_AddObject(m, "TabixIterator", (PyObject *)&TabixIterator_Type);
|
397
|
+
|
398
|
+
#if PY_MAJOR_VERSION >= 3
|
399
|
+
return m;
|
400
|
+
#endif
|
401
|
+
|
402
|
+
fail:
|
403
|
+
#if PY_MAJOR_VERSION < 3
|
404
|
+
return;
|
405
|
+
#else
|
406
|
+
return NULL;
|
407
|
+
#endif
|
408
|
+
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2011 Seoul National University.
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
# a copy of this software and associated documentation files (the
|
8
|
+
# "Software"), to deal in the Software without restriction, including
|
9
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
# the following conditions:
|
13
|
+
#
|
14
|
+
# The above copyright notice and this permission notice shall be
|
15
|
+
# included in all copies or substantial portions of the Software.
|
16
|
+
#
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
21
|
+
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
22
|
+
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
23
|
+
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
24
|
+
# SOFTWARE.
|
25
|
+
#
|
26
|
+
# Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
|
27
|
+
|
28
|
+
import unittest
|
29
|
+
import random
|
30
|
+
import gzip
|
31
|
+
import tabix
|
32
|
+
|
33
|
+
EXAMPLEFILE = '../example.gtf.gz'
|
34
|
+
|
35
|
+
def load_example_regions(path):
    """Read a gzip-compressed, TAB-delimited annotation file into tuples.

    Each line is decoded as ASCII and split on TAB.  Column 1 is taken as
    the sequence id and columns 4 and 5 as the integer begin/end
    positions.

    Args:
        path: file name of the gzip-compressed input.

    Returns:
        A list of ``(seqid, begin, end, fields[:7])`` tuples, one per
        line, in file order.
    """
    alldata = []
    # Read the file named by the argument (previously the global
    # EXAMPLEFILE was always used) and make sure the handle is closed.
    gzfile = gzip.GzipFile(path)
    try:
        for line in gzfile:
            # Strip only the line terminator so an unterminated last
            # line does not lose a real character.
            fields = line.decode('ascii').rstrip('\n').split('\t')
            seqid = fields[0]
            begin = int(fields[3])
            end = int(fields[4])
            alldata.append((seqid, begin, end, fields[:7]))
    finally:
        gzfile.close()

    return alldata
|
45
|
+
|
46
|
+
def does_overlap(A, B, C, D):
    """Return True when D lies within [A, B] or B lies within [C, D]."""
    if A <= D <= B:
        return True
    return C <= B <= D
|
48
|
+
|
49
|
+
def sample_test_dataset(regions, ntests):
    """Draw ``ntests`` random query intervals and their expected matches.

    The interval bounds are drawn from the span of all regions widened by
    1000 on both sides (the lower bound is clamped at 0).

    Returns:
        A list of ``(seqid, low, high, matches)`` tuples where ``matches``
        holds the ``info`` entry of every region on ``seqid`` for which
        ``does_overlap(begin, end, low, high)`` is true.
    """
    seqids = [name for name, _, _, _ in regions]
    smallest_begin = min(start for _, start, _, _ in regions)
    largest_end = max(stop for _, _, stop, _ in regions)
    lowerbound = max(0, smallest_begin - 1000)
    upperbound = largest_end + 1000

    tests = []
    for _ in range(ntests):
        # Keep this draw order: choice, low, high.
        chosen = random.choice(seqids)
        low = random.randrange(lowerbound, upperbound)
        high = random.randrange(low, upperbound)

        # for 1-based both-end inclusive intervals
        matches = []
        for seq, begin, end, info in regions:
            if chosen == seq and does_overlap(begin, end, low, high):
                matches.append(info)

        tests.append((chosen, low, high, matches))

    return tests
|
67
|
+
|
68
|
+
def tbresult2excerpt(tbmatches):
    """Return the first seven columns of every row in ``tbmatches``."""
    excerpt = []
    for row in tbmatches:
        excerpt.append(row[:7])
    return excerpt
|
70
|
+
|
71
|
+
class TabixTest(unittest.TestCase):
    """Compare tabix query results with a brute-force scan of the example file."""

    # Built once, when the class body is executed.
    regions = load_example_regions(EXAMPLEFILE)
    testset = sample_test_dataset(regions, 500)

    def setUp(self):
        self.tb = tabix.Tabix(EXAMPLEFILE)

    def testQuery(self):
        for seqid, low, high, expected in self.testset:
            rows = self.tb.query(seqid, low, high)
            self.assertEqual(tbresult2excerpt(rows), expected)

    def testQueryS(self):
        for seqid, low, high, expected in self.testset:
            region = '%s:%d-%d' % (seqid, low, high)
            rows = self.tb.querys(region)
            self.assertEqual(tbresult2excerpt(rows), expected)
|
88
|
+
|
89
|
+
|
90
|
+
if __name__ == '__main__':
|
91
|
+
unittest.main()
|
data/ext/tabix/tabix.1
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
.TH tabix 1 "11 May 2010" "tabix-0.2.0" "Bioinformatics tools"
|
2
|
+
.SH NAME
|
3
|
+
.PP
|
4
|
+
bgzip - Block compression/decompression utility
|
5
|
+
.PP
|
6
|
+
tabix - Generic indexer for TAB-delimited genome position files
|
7
|
+
.SH SYNOPSIS
|
8
|
+
.PP
|
9
|
+
.B bgzip
|
10
|
+
.RB [ \-cdhB ]
|
11
|
+
.RB [ \-b
|
12
|
+
.IR virtualOffset ]
|
13
|
+
.RB [ \-s
|
14
|
+
.IR size ]
|
15
|
+
.RI [ file ]
|
16
|
+
.PP
|
17
|
+
.B tabix
|
18
|
+
.RB [ \-0lf ]
|
19
|
+
.RB [ \-p
|
20
|
+
.IR gff|bed|sam|vcf ]
|
21
|
+
.RB [ \-s
|
22
|
+
.IR seqCol ]
|
23
|
+
.RB [ \-b
|
24
|
+
.IR begCol ]
|
25
|
+
.RB [ \-e
|
26
|
+
.IR endCol ]
|
27
|
+
.RB [ \-S
|
28
|
+
.IR lineSkip ]
|
29
|
+
.RB [ \-c
|
30
|
+
.IR metaChar ]
|
31
|
+
.I in.tab.bgz
|
32
|
+
.RI [ "region1 " [ "region2 " [ ... "]]]"
|
33
|
+
|
34
|
+
.SH DESCRIPTION
|
35
|
+
.PP
|
36
|
+
Tabix indexes a TAB-delimited genome position file
|
37
|
+
.I in.tab.bgz
|
38
|
+
and creates an index file
|
39
|
+
.I in.tab.bgz.tbi
|
40
|
+
when
|
41
|
+
.I region
|
42
|
+
is absent from the command-line. The input data file must be position
|
43
|
+
sorted and compressed by
|
44
|
+
.B bgzip
|
45
|
+
which has a
|
46
|
+
.BR gzip (1)
|
47
|
+
like interface. After indexing, tabix is able to quickly retrieve data
|
48
|
+
lines overlapping
|
49
|
+
.I regions
|
50
|
+
specified in the format "chr:beginPos-endPos". Fast data retrieval also
|
51
|
+
works over network if URI is given as a file name and in this case the
|
52
|
+
index file will be downloaded if it is not present locally.
|
53
|
+
|
54
|
+
.SH OPTIONS OF TABIX
|
55
|
+
.TP 10
|
56
|
+
.BI "-p " STR
|
57
|
+
Input format for indexing. Valid values are: gff, bed, sam, vcf and
|
58
|
+
psltab. This option should not be applied together with any of
|
59
|
+
.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ;
|
60
|
+
it is not used for data retrieval because this setting is stored in
|
61
|
+
the index file. [gff]
|
62
|
+
.TP
|
63
|
+
.BI "-s " INT
|
64
|
+
Column of sequence name. Option
|
65
|
+
.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0
|
66
|
+
are all stored in the index file and thus not used in data retrieval. [1]
|
67
|
+
.TP
|
68
|
+
.BI "-b " INT
|
69
|
+
Column of start chromosomal position. [4]
|
70
|
+
.TP
|
71
|
+
.BI "-e " INT
|
72
|
+
Column of end chromosomal position. The end column can be the same as the
|
73
|
+
start column. [5]
|
74
|
+
.TP
|
75
|
+
.BI "-S " INT
|
76
|
+
Skip first INT lines in the data file. [0]
|
77
|
+
.TP
|
78
|
+
.BI "-c " CHAR
|
79
|
+
Skip lines started with character CHAR. [#]
|
80
|
+
.TP
|
81
|
+
.B -0
|
82
|
+
Specify that the position in the data file is 0-based (e.g. UCSC files)
|
83
|
+
rather than 1-based.
|
84
|
+
.TP
|
85
|
+
.B -h
|
86
|
+
Print the header/meta lines.
|
87
|
+
.TP
|
88
|
+
.B -B
|
89
|
+
The second argument is a BED file. When this option is in use, the input
|
90
|
+
file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless,
|
91
|
+
with this option, the format of the input must be specified correctly on the command line.
|
92
|
+
.TP
|
93
|
+
.B -f
|
94
|
+
Force to overwrite the index file if it is present.
|
95
|
+
.TP
|
96
|
+
.B -l
|
97
|
+
List the sequence names stored in the index file.
|
98
|
+
.RE
|
99
|
+
|
100
|
+
.SH EXAMPLE
|
101
|
+
(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz;
|
102
|
+
|
103
|
+
tabix -p gff sorted.gff.gz;
|
104
|
+
|
105
|
+
tabix sorted.gff.gz chr1:10,000,000-20,000,000;
|
106
|
+
|
107
|
+
.SH NOTES
|
108
|
+
It is straightforward to achieve overlap queries using the standard
|
109
|
+
B-tree index (with or without binning) implemented in all SQL databases,
|
110
|
+
or the R-tree index in PostgreSQL and Oracle. But there are still many
|
111
|
+
reasons to use tabix. Firstly, tabix directly works with a lot of widely
|
112
|
+
used TAB-delimited formats such as GFF/GTF and BED. We do not need to
|
113
|
+
design database schema or specialized binary formats. Data do not need
|
114
|
+
to be duplicated in different formats, either. Secondly, tabix works on
|
115
|
+
compressed data files while most SQL databases do not. The GenCode
|
116
|
+
annotation GTF can be compressed down to 4%. Thirdly, tabix is
|
117
|
+
fast. The same indexing algorithm is known to work efficiently for an
|
118
|
+
alignment with a few billion short reads. SQL databases probably cannot
|
119
|
+
easily handle data at this scale. Last but not least, tabix supports
|
120
|
+
remote data retrieval. One can put the data file and the index at an FTP
|
121
|
+
or HTTP server, and other users or even web services will be able to get
|
122
|
+
a slice without downloading the entire file.
|
123
|
+
|
124
|
+
.SH AUTHOR
|
125
|
+
.PP
|
126
|
+
Tabix was written by Heng Li. The BGZF library was originally
|
127
|
+
implemented by Bob Handsaker and modified by Heng Li for remote file
|
128
|
+
access and in-memory caching.
|
129
|
+
|
130
|
+
.SH SEE ALSO
|
131
|
+
.PP
|
132
|
+
.BR samtools (1)
|