bio-cgranges 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +65 -0
- data/Rakefile +44 -0
- data/ext/bio/cgranges/cgranges/LICENSE.txt +23 -0
- data/ext/bio/cgranges/cgranges/README.md +133 -0
- data/ext/bio/cgranges/cgranges/cgranges.c +330 -0
- data/ext/bio/cgranges/cgranges/cgranges.h +87 -0
- data/ext/bio/cgranges/cgranges/khash.h +627 -0
- data/ext/bio/cgranges/cgranges.c +342 -0
- data/ext/bio/cgranges/cgranges.h +7 -0
- data/ext/bio/cgranges/extconf.rb +9 -0
- data/lib/bio/cgranges/version.rb +7 -0
- data/lib/bio/cgranges.rb +10 -0
- metadata +57 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 618c6866b3f708b148682a529b73efda7b782bfc567f92949f72c237fd98435b
|
4
|
+
data.tar.gz: 0ebbc69b7858f934fd98f60500e49b7300774b21e05ee04c654d55746e18f00b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0444fad48ec7b6266072f2b7fb23684bf295703121c6d0b5b392e3c150de6dda02adbd1ecff161793b5f1a821f8d10cac564c4c60042f5fc4cc191bb26181620
|
7
|
+
data.tar.gz: 7f783598ec7ed7937c8163593901f055463175e4b16ab0073cf41bb96341f858077877524c5e1409369822e1efd5eaed80a11acc6c7678d9e27dee4ee3e14dec
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2021 kojix2
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# bio-cgranges
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/bio-cgranges.svg)](https://badge.fury.io/rb/bio-cgranges)
|
4
|
+
[![test](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml/badge.svg)](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml)
|
5
|
+
[![docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/bio-cgranges)
|
6
|
+
|
7
|
+
Ruby bindings to [lh3/cgranges](https://github.com/lh3/cgranges).
|
8
|
+
|
9
|
+
> cgranges is a small C library for genomic interval overlap queries
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
```sh
|
14
|
+
gem install bio-cgranges
|
15
|
+
```
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require "bio/cgranges"
|
21
|
+
|
22
|
+
granges = Bio::CGRanges.new
|
23
|
+
granges.add("chr1", 10, 20, 0)
|
24
|
+
.add("chr1", 15, 25, 1)
|
25
|
+
.add("chr1", 30, 40, 2)
|
26
|
+
.add("chr1", 10, 25, 3)
|
27
|
+
.add("chr1", 15, 20, 4)
|
28
|
+
.add("chr2", 10, 20, 5)
|
29
|
+
.index
|
30
|
+
|
31
|
+
granges.overlap("chr1", 12, 22)
|
32
|
+
# [["chr1", 10, 20, 0],
|
33
|
+
# ["chr1", 10, 25, 3],
|
34
|
+
# ["chr1", 15, 25, 1],
|
35
|
+
# ["chr1", 15, 20, 4]]
|
36
|
+
|
37
|
+
granges.contain("chr1", 12, 22)
|
38
|
+
# [["chr1", 15, 20, 4]]
|
39
|
+
```
|
40
|
+
|
41
|
+
```
|
42
|
+
0.........5.........10........15........20........25........30........35........40
|
43
|
+
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
|
44
|
+
0-0-0-0-0-0-0-0-0-0
|
45
|
+
1-1-1-1-1-1-1-1-1-1
|
46
|
+
2-2-2-2-2-2-2-2-2-2
|
47
|
+
3-3-3-3-3-3-3-3-3-3-3-3-3-3-3
|
48
|
+
4-4-4-4-4
|
49
|
+
5-5-5-5-5
|
50
|
+
Q-Q-Q-Q-Q-Q-Q-Q-Q-Q
|
51
|
+
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
|
52
|
+
0.........5.........10........15........20........25........30........35........40
|
53
|
+
```
|
54
|
+
|
55
|
+
## Development
|
56
|
+
|
57
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ruby-on-bioc/bio-cgranges.
|
58
|
+
|
59
|
+
Do you need commit rights to my repository?
|
60
|
+
Do you want to get admin rights and take over the project?
|
61
|
+
If so, please feel free to contact us @kojix2.
|
62
|
+
|
63
|
+
## License
|
64
|
+
|
65
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true

# Build, test and lint tasks for the bio-cgranges gem.

require "bundler/gem_tasks"
require "rake/testtask"
require "rubocop/rake_task"
require "rake/extensiontask"

# `rake test` runs every test/**/*_test.rb file with test/ and lib/ on the load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# `rake rubocop`
RuboCop::RakeTask.new

# Building the gem depends on the native extension being compiled first.
task build: :compile

# `rake compile` builds ext/bio/cgranges and places the result under lib/bio/cgranges.
Rake::ExtensionTask.new("cgranges") do |extension|
  extension.ext_dir = "ext/bio/cgranges"
  extension.lib_dir = "lib/bio/cgranges"
end

desc "Remove object file"
task :remove_object_file do
  Dir.glob("ext/**/*.{o,bundle}").each { |path| FileUtils.rm(path) }
end

task default: %i[clobber compile remove_object_file test rubocop]

task cleanall: %i[clobber remove_object_file clean]
|
@@ -0,0 +1,23 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2019 Dana-Farber Cancer Institute
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
@@ -0,0 +1,133 @@
|
|
1
|
+
## Introduction
|
2
|
+
|
3
|
+
cgranges is a small C library for genomic interval overlap queries: given a
|
4
|
+
genomic region *r* and a set of regions *R*, finding all regions in *R* that
|
5
|
+
overlap *r*. Although this library is based on [interval tree][itree], a well
|
6
|
+
known data structure, the core algorithm of cgranges is distinct from all
|
7
|
+
existing implementations to the best of our knowledge. Specifically, the
|
8
|
+
interval tree in cgranges is implicitly encoded as a plain sorted array
|
9
|
+
(similar to [binary heap][bheap] but packed differently). Tree
|
10
|
+
traversal is achieved by jumping between array indices. This treatment makes
|
11
|
+
cgranges very efficient and compact in memory. The core algorithm can be
|
12
|
+
implemented in ~50 lines of C++ code, much shorter than others as well. Please
|
13
|
+
see the code comments in [cpp/IITree.h](cpp/IITree.h) for details.
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
### Test with BED coverage
|
18
|
+
|
19
|
+
For testing purposes, this repo implements the [bedtools coverage][bedcov] tool
|
20
|
+
with cgranges. The source code is located in the [test/](test) directory. You
|
21
|
+
can compile and run the test with:
|
22
|
+
```sh
|
23
|
+
cd test && make
|
24
|
+
./bedcov-cr test1.bed test2.bed
|
25
|
+
```
|
26
|
+
The first BED file is loaded into RAM and indexed. The depth and the breadth of
|
27
|
+
coverage of each region in the second file is computed by query against the
|
28
|
+
index of the first file.
|
29
|
+
|
30
|
+
The [test/](test) directory also contains a few other implementations based on
|
31
|
+
[IntervalTree.h][ekg-itree] in C++, [quicksect][quicksect] in Cython and
|
32
|
+
[ncls][ncls] in Cython. The table below shows timing and peak memory on two
|
33
|
+
test BEDs available in the release page. The first BED contains GenCode
|
34
|
+
annotations with ~1.2 million lines, mixing all types of features. The second
|
35
|
+
contains ~10 million direct-RNA mappings. Time1a/Mem1a indexes the GenCode BED
|
36
|
+
into memory. Time1b adds whole chromosome intervals to the GenCode BED when
|
37
|
+
indexing. Time2/Mem2 indexes the RNA-mapping BED into memory. Numbers are
|
38
|
+
averaged over 5 runs.
|
39
|
+
|
40
|
+
|Algo. |Lang. |Cov|Program |Time1a|Time1b|Mem1a |Time2 |Mem2 |
|
41
|
+
|:-------|:-----|:-:|:---------------|-----:|-----:|-------:|-----:|-------:|
|
42
|
+
|IAITree |C |Y |cgranges |9.0s |13.9s |19.1MB |4.6s |138.4MB |
|
43
|
+
|IAITree |C++ |Y |cpp/iitree.h |11.1s |24.5s |22.4MB |5.8s |160.4MB |
|
44
|
+
|CITree |C++ |Y |IntervalTree.h |17.4s |17.4s |27.2MB |10.5s |179.5MB |
|
45
|
+
|IAITree |C |N |cgranges |7.6s |13.0s |19.1MB |4.1s |138.4MB |
|
46
|
+
|AIList |C |N |3rd-party/AIList|7.9s |8.1s |14.4MB |6.5s |104.8MB |
|
47
|
+
|NCList |C |N |3rd-party/NCList|13.0s |13.4s |21.4MB |10.6s |183.0MB |
|
48
|
+
|AITree |C |N |3rd-party/AITree|16.8s |18.4s |73.4MB |27.3s |546.4MB |
|
49
|
+
|IAITree |Cython|N |cgranges |56.6s |63.9s |23.4MB |43.9s |143.1MB |
|
50
|
+
|binning |C++ |Y |bedtools |201.9s|280.4s|478.5MB |149.1s|3438.1MB|
|
51
|
+
|
52
|
+
Here, IAITree = implicit augmented interval tree, used by cgranges;
|
53
|
+
CITree = centered interval tree, used by [Erik Garrison's
|
54
|
+
IntervalTree][ekg-itree]; AIList = augmented interval list, by [Feng et
|
55
|
+
al][ailist]; NCList = nested containment list, taken from [ncls][ncls] by Feng
|
56
|
+
et al; AITree = augmented interval tree, from [kerneltree][kerneltree].
|
57
|
+
"Cov" indicates whether the program calculates breadth of coverage.
|
58
|
+
Comments:
|
59
|
+
|
60
|
+
* AIList keeps start and end only. IAITree and CITree additionally store a
|
61
|
+
4-byte "ID" field per interval to reference the source of interval. This is
|
62
|
+
partly why AIList uses the least memory.
|
63
|
+
|
64
|
+
* IAITree is more sensitive to the worst case: the presence of an interval
|
65
|
+
spanning the whole chromosome.
|
66
|
+
|
67
|
+
* IAITree uses an efficient radix sort. CITree uses std::sort from STL, which
|
68
|
+
is ok. AIList and NCList use qsort from libc, which is slow. Faster sorting
|
69
|
+
leads to faster indexing.
|
70
|
+
|
71
|
+
* IAITree in C++ uses identical core algorithm to the C version, but limited by
|
72
|
+
its APIs, it wastes time on memory locality and management. CITree has a
|
73
|
+
similar issue.
|
74
|
+
|
75
|
+
* Computing coverage is better done when the returned list of intervals is
|
76
|
+
start sorted. IAITree returns sorted list. CITree doesn't. Not sure about
|
77
|
+
others. Computing coverage takes a couple of seconds. Sorting will be slower.
|
78
|
+
|
79
|
+
* Printing intervals also takes a noticeable fraction of time. Custom printf
|
80
|
+
equivalent would be faster.
|
81
|
+
|
82
|
+
* IAITree+Cython is a wrapper around the C version of cgranges. Cython adds
|
83
|
+
significant overhead.
|
84
|
+
|
85
|
+
* Bedtools is designed for a variety of applications in addition to computing
|
86
|
+
coverage. It may keep other information in its internal data structure. This
|
87
|
+
micro-benchmark may be unfair to bedtools.
|
88
|
+
|
89
|
+
* In general, the performance is affected a lot by subtle implementation
|
90
|
+
details. CITree, IAITree, NCList and AIList are all broadly comparable in
|
91
|
+
performance. AITree is not recommended when indexed intervals are immutable.
|
92
|
+
|
93
|
+
### Use cgranges as a C library
|
94
|
+
|
95
|
+
```c
|
96
|
+
cgranges_t *cr = cr_init(); // initialize a cgranges_t object
|
97
|
+
cr_add(cr, "chr1", 20, 30, 0); // add a genomic interval
|
98
|
+
cr_add(cr, "chr2", 10, 30, 1);
|
99
|
+
cr_add(cr, "chr1", 10, 25, 2);
|
100
|
+
cr_index(cr); // index
|
101
|
+
|
102
|
+
int64_t i, n, *b = 0, max_b = 0;
|
103
|
+
n = cr_overlap(cr, "chr1", 15, 22, &b, &max_b); // overlap query; output array b[] can be reused
|
104
|
+
for (i = 0; i < n; ++i) // traverse overlapping intervals
|
105
|
+
printf("%d\t%d\t%d\n", cr_start(cr, b[i]), cr_end(cr, b[i]), cr_label(cr, b[i]));
|
106
|
+
free(b); // b[] is allocated by malloc() inside cr_overlap(), so needs to be freed with free()
|
107
|
+
|
108
|
+
cr_destroy(cr);
|
109
|
+
```
|
110
|
+
|
111
|
+
### Use IITree as a C++ library
|
112
|
+
|
113
|
+
```cpp
|
114
|
+
IITree<int, int> tree;
|
115
|
+
tree.add(12, 34, 0); // add an interval
|
116
|
+
tree.add(0, 23, 1);
|
117
|
+
tree.add(34, 56, 2);
|
118
|
+
tree.index(); // index
|
119
|
+
std::vector<size_t> a;
|
120
|
+
tree.overlap(22, 25, a); // retrieve overlaps
|
121
|
+
for (size_t i = 0; i < a.size(); ++i)
|
122
|
+
printf("%d\t%d\t%d\n", tree.start(a[i]), tree.end(a[i]), tree.data(a[i]));
|
123
|
+
```
|
124
|
+
|
125
|
+
[bedcov]: https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html
|
126
|
+
[ekg-itree]: https://github.com/ekg/intervaltree
|
127
|
+
[quicksect]: https://github.com/brentp/quicksect
|
128
|
+
[ncls]: https://github.com/hunt-genes/ncls
|
129
|
+
[citree]: https://en.wikipedia.org/wiki/Interval_tree#Centered_interval_tree
|
130
|
+
[itree]: https://en.wikipedia.org/wiki/Interval_tree
|
131
|
+
[bheap]: https://en.wikipedia.org/wiki/Binary_heap
|
132
|
+
[ailist]: https://www.biorxiv.org/content/10.1101/593657v1
|
133
|
+
[kerneltree]: https://github.com/biocore-ntnu/kerneltree
|
@@ -0,0 +1,330 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include "cgranges.h"
|
4
|
+
#include "khash.h"
|
5
|
+
|
6
|
+
/**************
|
7
|
+
* Radix sort *
|
8
|
+
**************/
|
9
|
+
|
10
|
+
/*
 * Generator for an in-place, most-significant-digit-first radix sort.
 *
 * KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) defines three functions
 * for element type rstype_t, ordered ascending by the integer key rskey(elem):
 *   rs_insertsort_<name>(beg, end)       - insertion sort of [beg, end)
 *   rs_sort_<name>(beg, end, n_bits, s)  - one radix pass on the n_bits-wide
 *                                          digit at bit offset s, then recursion
 *   radix_sort_<name>(beg, end)          - entry point
 * sizeof_key is the key width in bytes; it selects the first digit examined.
 */
#define RS_MIN_SIZE 64 /* ranges of at most this many elements are insertion-sorted */
|
11
|
+
#define RS_MAX_BITS 8 /* digit width in bits used by radix_sort_<name>; also sizes the bucket array */
|
12
|
+
|
13
|
+
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
|
14
|
+
typedef struct { /* one bucket: the half-open element range [b, e) */ \
|
15
|
+
rstype_t *b, *e; \
|
16
|
+
} rsbucket_##name##_t; \
|
17
|
+
void rs_insertsort_##name(rstype_t *beg, rstype_t *end) /* insertion sort of [beg, end) by rskey */ \
|
18
|
+
{ \
|
19
|
+
rstype_t *i; \
|
20
|
+
for (i = beg + 1; i < end; ++i) \
|
21
|
+
if (rskey(*i) < rskey(*(i - 1))) { /* only move elements that are out of order */ \
|
22
|
+
rstype_t *j, tmp = *i; \
|
23
|
+
for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) /* shift larger elements right */ \
|
24
|
+
*j = *(j - 1); \
|
25
|
+
*j = tmp; \
|
26
|
+
} \
|
27
|
+
} \
|
28
|
+
void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) /* s: bit offset of the digit to sort on */ \
|
29
|
+
{ \
|
30
|
+
rstype_t *i; \
|
31
|
+
int size = 1<<n_bits, m = size - 1; /* bucket count and digit mask */ \
|
32
|
+
rsbucket_##name##_t *k, b[1<<RS_MAX_BITS], *be = b + size; \
|
33
|
+
assert(n_bits <= RS_MAX_BITS); \
|
34
|
+
for (k = b; k != be; ++k) k->b = k->e = beg; \
|
35
|
+
for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; /* count: afterwards e - beg is the bucket size */ \
|
36
|
+
for (k = b + 1; k != be; ++k) /* prefix sum: turn counts into destination ranges [b, e) */ \
|
37
|
+
k->e += (k-1)->e - beg, k->b = (k-1)->e; \
|
38
|
+
for (k = b; k != be;) { /* permute elements into their buckets in place */ \
|
39
|
+
if (k->b != k->e) { \
|
40
|
+
rsbucket_##name##_t *l; \
|
41
|
+
if ((l = b + (rskey(*k->b)>>s&m)) != k) { /* element at k->b belongs to another bucket l */ \
|
42
|
+
rstype_t tmp = *k->b, swap; \
|
43
|
+
do { /* drop the carried element into its bucket and pick up the one displaced */ \
|
44
|
+
swap = tmp; tmp = *l->b; *l->b++ = swap; \
|
45
|
+
l = b + (rskey(tmp)>>s&m); \
|
46
|
+
} while (l != k); /* stop once the carried element belongs to bucket k */ \
|
47
|
+
*k->b++ = tmp; \
|
48
|
+
} else ++k->b; /* already in the right bucket */ \
|
49
|
+
} else ++k; /* bucket k is complete */ \
|
50
|
+
} \
|
51
|
+
for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; /* restore the b pointers advanced above */ \
|
52
|
+
if (s) { /* lower digits remain: sort within each bucket */ \
|
53
|
+
s = s > n_bits? s - n_bits : 0; \
|
54
|
+
for (k = b; k != be; ++k) \
|
55
|
+
if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
|
56
|
+
else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
|
57
|
+
} \
|
58
|
+
} \
|
59
|
+
void radix_sort_##name(rstype_t *beg, rstype_t *end) /* entry point: sort [beg, end) ascending by rskey */ \
|
60
|
+
{ \
|
61
|
+
if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
|
62
|
+
else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); /* start at the top digit of a sizeof_key-byte key */ \
|
63
|
+
}
|
64
|
+
|
65
|
+
/*********************
|
66
|
+
* Convenient macros *
|
67
|
+
*********************/
|
68
|
+
|
69
|
+
/*
 * kroundup32(v): in place, replace the unsigned 32-bit lvalue v by the
 * smallest power of two that is >= v (a power of two is left unchanged).
 * Only defined here if no earlier header already provides it.
 */
#if !defined(kroundup32)
#define kroundup32(v) ( \
		--(v), \
		(v) |= (v) >> 1, \
		(v) |= (v) >> 2, \
		(v) |= (v) >> 4, \
		(v) |= (v) >> 8, \
		(v) |= (v) >> 16, \
		++(v))
#endif

/* CALLOC(T, count): zero-filled array of `count` objects of type T, as T*. */
#define CALLOC(T, count) ((T*)calloc((count), sizeof(T)))

/*
 * REALLOC(p, count): resize the array p to `count` elements and store the
 * result of realloc() back into p. The result is not checked.
 */
#define REALLOC(p, count) ((p) = (__typeof__(p))realloc((p), (count) * sizeof(*(p))))

/*
 * EXPAND(arr, cap): grow capacity `cap` (16 if it was 0, otherwise by half)
 * and resize `arr` to the new capacity.
 */
#define EXPAND(arr, cap) do { \
		(cap) = (cap)? (cap) + ((cap)>>1) : 16; \
		REALLOC((arr), (cap)); \
	} while (0)
|
80
|
+
|
81
|
+
/********************
|
82
|
+
* Basic operations *
|
83
|
+
********************/
|
84
|
+
|
85
|
+
#define cr_intv_key(r) ((r).x) /* sort key of an interval: its 64-bit x field */
|
86
|
+
KRADIX_SORT_INIT(cr_intv, cr_intv_t, cr_intv_key, 8) /* defines radix_sort_cr_intv() and helpers for an 8-byte key */
|
87
|
+
|
88
|
+
KHASH_MAP_INIT_STR(str, int32_t) /* hash map "str": C-string keys to int32_t values */
|
89
|
+
typedef khash_t(str) strhash_t; /* type behind cgranges_t::hc; maps contig name to its index in cr->ctg */
|
90
|
+
|
91
|
+
cgranges_t *cr_init(void)
|
92
|
+
{
|
93
|
+
cgranges_t *cr;
|
94
|
+
cr = CALLOC(cgranges_t, 1);
|
95
|
+
cr->hc = kh_init(str);
|
96
|
+
return cr;
|
97
|
+
}
|
98
|
+
|
99
|
+
/**
 * Release a container created by cr_init() together with everything it owns:
 * the interval array, each contig name, the contig array and the name hash.
 *
 * @param cr container to free; a NULL pointer is accepted and ignored.
 */
void cr_destroy(cgranges_t *cr)
{
	int32_t i;
	if (cr == 0) return;
	free(cr->r); /* interval array grown by cr_add(); it was leaked before */
	for (i = 0; i < cr->n_ctg; ++i)
		free(cr->ctg[i].name); /* names are strdup()'ed in cr_add_ctg() */
	free(cr->ctg);
	/* hash keys alias the contig names freed above, so only the table itself is destroyed */
	kh_destroy(str, (strhash_t*)cr->hc);
	free(cr);
}
|
109
|
+
|
110
|
+
/**
 * Look up contig `ctg`, registering it first if it is not known yet.
 *
 * A new contig gets the next free index in cr->ctg, a private copy of its
 * name (also used as the hash key), no intervals (n = 0) and off = -1.
 * For both new and existing contigs the stored length is raised to `len`
 * when `len` is larger.
 *
 * @return the index of the contig in cr->ctg.
 */
int32_t cr_add_ctg(cgranges_t *cr, const char *ctg, int32_t len)
{
	strhash_t *names = (strhash_t*)cr->hc;
	int is_new;
	khint_t slot = kh_put(str, names, ctg, &is_new);
	cr_ctg_t *entry;

	if (is_new) {
		if (cr->n_ctg == cr->m_ctg)
			EXPAND(cr->ctg, cr->m_ctg);
		kh_val(names, slot) = cr->n_ctg;
		entry = &cr->ctg[cr->n_ctg++];
		entry->name = strdup(ctg);
		kh_key(names, slot) = entry->name; /* key must outlive the caller's string */
		entry->len = len;
		entry->n = 0;
		entry->off = -1;
	}

	entry = &cr->ctg[kh_val(names, slot)];
	if (entry->len < len)
		entry->len = len;
	return kh_val(names, slot);
}
|
131
|
+
|
132
|
+
int32_t cr_get_ctg(const cgranges_t *cr, const char *ctg)
|
133
|
+
{
|
134
|
+
khint_t k;
|
135
|
+
strhash_t *h = (strhash_t*)cr->hc;
|
136
|
+
k = kh_get(str, h, ctg);
|
137
|
+
return k == kh_end(h)? -1 : kh_val(h, k);
|
138
|
+
}
|
139
|
+
|
140
|
+
/**
 * Append one interval to the container (call cr_index() afterwards).
 *
 * The contig is registered on first use, and its recorded length is raised to
 * `en` when needed. The interval is stored with the contig index in the upper
 * 32 bits of x and the start in the lower 32 bits, the end in y and the
 * caller's label in `label`.
 *
 * @param cr        container to add to
 * @param ctg       contig name
 * @param st        interval start; must satisfy 0 <= st <= en
 * @param en        interval end
 * @param label_int caller-defined label stored with the interval
 * @return pointer to the stored interval (valid until the array is next
 *         reallocated or sorted), or NULL if the coordinates are rejected.
 */
cr_intv_t *cr_add(cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int32_t label_int)
{
	cr_intv_t *p;
	int32_t k;
	/* A negative start would sign-extend into the contig half of x below and
	 * later be used as an index into cr->ctg by cr_index_prepare(). */
	if (st < 0 || st > en) return 0;
	k = cr_add_ctg(cr, ctg, 0);
	if (cr->n_r == cr->m_r)
		EXPAND(cr->r, cr->m_r);
	p = &cr->r[cr->n_r++];
	p->x = (uint64_t)k << 32 | (uint32_t)st;
	p->y = en;
	p->label = label_int;
	if (cr->ctg[k].len < en)
		cr->ctg[k].len = en;
	return p;
}
|
156
|
+
|
157
|
+
/**
 * Sort the interval array in ascending order of the packed key x
 * (contig index in the upper 32 bits, start in the lower 32 bits).
 * Does nothing when there are no contigs or no intervals.
 */
void cr_sort(cgranges_t *cr)
{
	if (cr->n_ctg != 0 && cr->n_r != 0)
		radix_sort_cr_intv(cr->r, cr->r + cr->n_r);
}
|
162
|
+
|
163
|
+
int32_t cr_is_sorted(const cgranges_t *cr)
|
164
|
+
{
|
165
|
+
uint64_t i;
|
166
|
+
for (i = 1; i < cr->n_r; ++i)
|
167
|
+
if (cr->r[i-1].x > cr->r[i].x)
|
168
|
+
break;
|
169
|
+
return (i == cr->n_r);
|
170
|
+
}
|
171
|
+
|
172
|
+
/************
|
173
|
+
* Indexing *
|
174
|
+
************/
|
175
|
+
|
176
|
+
/**
 * Bring the interval array into the layout that cr_index1() expects.
 *
 * 1. Sort the intervals by (contig, start) unless they already are sorted.
 * 2. Record, for every contig that has intervals, the offset of its first
 *    interval and the number of its intervals.
 * 3. Re-pack each interval: x becomes start<<32 | end (the contig index is
 *    shifted out) and y is cleared.
 */
void cr_index_prepare(cgranges_t *cr)
{
	int64_t i, run_start = 0;

	if (!cr_is_sorted(cr))
		cr_sort(cr);

	/* i == n_r acts as a sentinel that closes the final run */
	for (i = 1; i <= cr->n_r; ++i) {
		int32_t ctg_id;
		if (i < cr->n_r && cr->r[i].x>>32 == cr->r[run_start].x>>32)
			continue; /* still inside the current contig's run */
		ctg_id = cr->r[run_start].x>>32;
		cr->ctg[ctg_id].off = run_start;
		cr->ctg[ctg_id].n = i - run_start;
		run_start = i;
	}

	for (i = 0; i < cr->n_r; ++i) {
		cr_intv_t *intv = &cr->r[i];
		intv->x = intv->x<<32 | intv->y;
		intv->y = 0;
	}
}
|
194
|
+
|
195
|
+
/**
 * Build the implicit interval tree over one contig's sorted intervals.
 *
 * On entry the low 32 bits of a[i].x hold the interval end. On return a[i].y
 * holds the largest end found in the subtree rooted at i. Leaves sit at even
 * indices; a node at level k has its children half = 2^(k-1) slots to either
 * side. `tail_max` stands in for a right child that would lie beyond the end
 * of the array.
 *
 * @param a intervals of one contig
 * @param n number of intervals in a
 * @return the level of the root node, or -1 if n <= 0.
 */
int32_t cr_index1(cr_intv_t *a, int64_t n)
{
	int64_t pos, tail_pos;
	int32_t tail_max, level;

	if (n <= 0)
		return -1;

	/* level 0: each leaf's maximum is its own end; remember the last leaf */
	for (pos = 0; pos < n; pos += 2) {
		a[pos].y = (int32_t)a[pos].x;
		tail_pos = pos;
		tail_max = a[pos].y;
	}

	for (level = 1; 1LL<<level <= n; ++level) {
		int64_t half = 1LL<<(level-1);
		int64_t first = (half<<1) - 1;
		int64_t stride = half<<2;

		for (pos = first; pos < n; pos += stride) {
			int32_t best = (int32_t)a[pos].x;
			int32_t left_max = a[pos - half].y;
			int32_t right_max = pos + half < n? a[pos + half].y : tail_max;
			if (left_max > best) best = left_max;
			if (right_max > best) best = right_max;
			a[pos].y = best;
		}

		/* move to the parent of the tracked right-most node */
		if (tail_pos>>level&1)
			tail_pos -= half;
		else
			tail_pos += half;
		if (tail_pos < n && a[tail_pos].y > tail_max)
			tail_max = a[tail_pos].y;
	}
	return level - 1;
}
|
217
|
+
|
218
|
+
/**
 * Index the container so that overlap and containment queries can be run.
 *
 * Sorts and re-packs the intervals via cr_index_prepare(), then builds one
 * implicit tree per contig and stores its root level in root_k.
 * Contigs without intervals get root_k = -1.
 */
void cr_index(cgranges_t *cr)
{
	int32_t i;
	cr_index_prepare(cr);
	for (i = 0; i < cr->n_ctg; ++i) {
		if (cr->ctg[i].n > 0)
			cr->ctg[i].root_k = cr_index1(&cr->r[cr->ctg[i].off], cr->ctg[i].n);
		else /* off is -1 here: do not form &cr->r[-1]; -1 is what cr_index1() returns for n <= 0 */
			cr->ctg[i].root_k = -1;
	}
}
|
225
|
+
|
226
|
+
/*********
|
227
|
+
* Query *
|
228
|
+
*********/
|
229
|
+
|
230
|
+
int64_t cr_min_start_int(const cgranges_t *cr, int32_t ctg_id, int32_t st) // find the smallest i such that cr_st(&r[i]) >= st
|
231
|
+
{
|
232
|
+
int64_t left, right;
|
233
|
+
const cr_ctg_t *c;
|
234
|
+
const cr_intv_t *r;
|
235
|
+
|
236
|
+
if (ctg_id < 0 || ctg_id >= cr->n_ctg) return -1;
|
237
|
+
c = &cr->ctg[ctg_id];
|
238
|
+
r = &cr->r[c->off];
|
239
|
+
if (c->n == 0) return -1;
|
240
|
+
left = 0, right = c->n;
|
241
|
+
while (right > left) {
|
242
|
+
int64_t mid = left + ((right - left) >> 1);
|
243
|
+
if (cr_st(&r[mid]) >= st) right = mid;
|
244
|
+
else left = mid + 1;
|
245
|
+
}
|
246
|
+
assert(left == right);
|
247
|
+
return left == c->n? -1 : c->off + left;
|
248
|
+
}
|
249
|
+
|
250
|
+
/* One frame of the explicit stack used by cr_overlap_int() to walk the tree. */
typedef struct {
	int64_t x; /* index of the node within the contig's interval array */
	int32_t k; /* level of the node; its subtree spans at most (1<<(k+1))-1 slots */
	int32_t w; /* 0 until the left child has been handled, 1 afterwards */
} istack_t;
|
254
|
+
|
255
|
+
int64_t cr_overlap_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
|
256
|
+
{
|
257
|
+
int32_t t = 0;
|
258
|
+
const cr_ctg_t *c;
|
259
|
+
const cr_intv_t *r;
|
260
|
+
int64_t *b = *b_, m_b = *m_b_, n = 0;
|
261
|
+
istack_t stack[64], *p;
|
262
|
+
|
263
|
+
if (ctg_id < 0 || ctg_id >= cr->n_ctg) return 0;
|
264
|
+
c = &cr->ctg[ctg_id];
|
265
|
+
r = &cr->r[c->off];
|
266
|
+
p = &stack[t++];
|
267
|
+
p->k = c->root_k, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
|
268
|
+
while (t) { // stack is not empyt
|
269
|
+
istack_t z = stack[--t];
|
270
|
+
if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
|
271
|
+
int64_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
|
272
|
+
if (i1 >= c->n) i1 = c->n;
|
273
|
+
for (i = i0; i < i1 && cr_st(&r[i]) < en; ++i)
|
274
|
+
if (st < cr_en(&r[i])) {
|
275
|
+
if (n == m_b) EXPAND(b, m_b);
|
276
|
+
b[n++] = c->off + i;
|
277
|
+
}
|
278
|
+
} else if (z.w == 0) { // if left child not processed
|
279
|
+
int64_t y = z.x - (1LL<<(z.k-1));
|
280
|
+
p = &stack[t++];
|
281
|
+
p->k = z.k, p->x = z.x, p->w = 1;
|
282
|
+
if (y >= c->n || r[y].y > st) {
|
283
|
+
p = &stack[t++];
|
284
|
+
p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
|
285
|
+
}
|
286
|
+
} else if (z.x < c->n && cr_st(&r[z.x]) < en) {
|
287
|
+
if (st < cr_en(&r[z.x])) { // then z.x overlaps the query; write to the output array
|
288
|
+
if (n == m_b) EXPAND(b, m_b);
|
289
|
+
b[n++] = c->off + z.x;
|
290
|
+
}
|
291
|
+
p = &stack[t++];
|
292
|
+
p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
|
293
|
+
}
|
294
|
+
}
|
295
|
+
*b_ = b, *m_b_ = m_b;
|
296
|
+
return n;
|
297
|
+
}
|
298
|
+
|
299
|
+
int64_t cr_contain_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
|
300
|
+
{
|
301
|
+
int64_t n = 0, i, s, e, *b = *b_, m_b = *m_b_;
|
302
|
+
s = cr_min_start_int(cr, ctg_id, st);
|
303
|
+
if (s < 0) return 0;
|
304
|
+
e = cr->ctg[ctg_id].off + cr->ctg[ctg_id].n;
|
305
|
+
for (i = s; i < e; ++i) {
|
306
|
+
const cr_intv_t *r = &cr->r[i];
|
307
|
+
if (cr_st(r) >= en) break;
|
308
|
+
if (cr_st(r) >= st && cr_en(r) <= en) {
|
309
|
+
if (n == m_b) EXPAND(b, m_b);
|
310
|
+
b[n++] = i;
|
311
|
+
}
|
312
|
+
}
|
313
|
+
*b_ = b, *m_b_ = m_b;
|
314
|
+
return n;
|
315
|
+
}
|
316
|
+
|
317
|
+
/**
 * Name-based front end to cr_min_start_int(): resolves `ctg` to its contig
 * index first. An unknown name resolves to -1, which the callee reports as -1.
 */
int64_t cr_min_start(const cgranges_t *cr, const char *ctg, int32_t st)
{
	int32_t ctg_id = cr_get_ctg(cr, ctg);
	return cr_min_start_int(cr, ctg_id, st);
}
|
321
|
+
|
322
|
+
/**
 * Name-based front end to cr_overlap_int(): resolves `ctg` to its contig
 * index first. An unknown name resolves to -1, for which the callee returns 0.
 */
int64_t cr_overlap(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
{
	int32_t ctg_id = cr_get_ctg(cr, ctg);
	return cr_overlap_int(cr, ctg_id, st, en, b_, m_b_);
}
|
326
|
+
|
327
|
+
/**
 * Name-based front end to cr_contain_int(): resolves `ctg` to its contig
 * index first. An unknown name resolves to -1, for which the callee returns 0.
 */
int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
{
	int32_t ctg_id = cr_get_ctg(cr, ctg);
	return cr_contain_int(cr, ctg_id, st, en, b_, m_b_);
}
|