bio-cgranges 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +65 -0
- data/Rakefile +44 -0
- data/ext/bio/cgranges/cgranges/LICENSE.txt +23 -0
- data/ext/bio/cgranges/cgranges/README.md +133 -0
- data/ext/bio/cgranges/cgranges/cgranges.c +330 -0
- data/ext/bio/cgranges/cgranges/cgranges.h +87 -0
- data/ext/bio/cgranges/cgranges/khash.h +627 -0
- data/ext/bio/cgranges/cgranges.c +342 -0
- data/ext/bio/cgranges/cgranges.h +7 -0
- data/ext/bio/cgranges/extconf.rb +9 -0
- data/lib/bio/cgranges/version.rb +7 -0
- data/lib/bio/cgranges.rb +10 -0
- metadata +57 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 618c6866b3f708b148682a529b73efda7b782bfc567f92949f72c237fd98435b
|
4
|
+
data.tar.gz: 0ebbc69b7858f934fd98f60500e49b7300774b21e05ee04c654d55746e18f00b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0444fad48ec7b6266072f2b7fb23684bf295703121c6d0b5b392e3c150de6dda02adbd1ecff161793b5f1a821f8d10cac564c4c60042f5fc4cc191bb26181620
|
7
|
+
data.tar.gz: 7f783598ec7ed7937c8163593901f055463175e4b16ab0073cf41bb96341f858077877524c5e1409369822e1efd5eaed80a11acc6c7678d9e27dee4ee3e14dec
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2021 kojix2
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# bio-cgranges
|
2
|
+
|
3
|
+
[Gem Version](https://badge.fury.io/rb/bio-cgranges)
|
4
|
+
[CI](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml)
|
5
|
+
[Docs](https://rubydoc.info/gems/bio-cgranges)
|
6
|
+
|
7
|
+
Ruby bindings to [lh3/cgranges](https://github.com/lh3/cgranges).
|
8
|
+
|
9
|
+
> cgranges is a small C library for genomic interval overlap queries
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
```sh
|
14
|
+
gem install bio-cgranges
|
15
|
+
```
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require "bio/cgranges"
|
21
|
+
|
22
|
+
granges = Bio::CGRanges.new
|
23
|
+
granges.add("chr1", 10, 20, 0)
|
24
|
+
.add("chr1", 15, 25, 1)
|
25
|
+
.add("chr1", 30, 40, 2)
|
26
|
+
.add("chr1", 10, 25, 3)
|
27
|
+
.add("chr1", 15, 20, 4)
|
28
|
+
.add("chr2", 10, 20, 5)
|
29
|
+
.index
|
30
|
+
|
31
|
+
granges.overlap("chr1", 12, 22)
|
32
|
+
# [["chr1", 10, 20, 0],
|
33
|
+
# ["chr1", 10, 25, 3],
|
34
|
+
# ["chr1", 15, 25, 1],
|
35
|
+
# ["chr1", 15, 20, 4]]
|
36
|
+
|
37
|
+
granges.contain("chr1", 12, 22)
|
38
|
+
# [["chr1", 15, 20, 4]]
|
39
|
+
```
|
40
|
+
|
41
|
+
```
|
42
|
+
0.........5.........10........15........20........25........30........35........40
|
43
|
+
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
|
44
|
+
0-0-0-0-0-0-0-0-0-0
|
45
|
+
1-1-1-1-1-1-1-1-1-1
|
46
|
+
2-2-2-2-2-2-2-2-2-2
|
47
|
+
3-3-3-3-3-3-3-3-3-3-3-3-3-3-3
|
48
|
+
4-4-4-4-4
|
49
|
+
5-5-5-5-5
|
50
|
+
Q-Q-Q-Q-Q-Q-Q-Q-Q-Q
|
51
|
+
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
|
52
|
+
0.........5.........10........15........20........25........30........35........40
|
53
|
+
```
|
54
|
+
|
55
|
+
## Development
|
56
|
+
|
57
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ruby-on-bioc/bio-cgranges.
|
58
|
+
|
59
|
+
Do you need commit rights to my repository?
|
60
|
+
Do you want to get admin rights and take over the project?
|
61
|
+
If so, please feel free to contact us @kojix2.
|
62
|
+
|
63
|
+
## License
|
64
|
+
|
65
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true

require "bundler/gem_tasks"
require "rake/testtask"

# Unit tests: every test/**/*_test.rb file, with test/ and lib/ on the load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

require "rubocop/rake_task"

RuboCop::RakeTask.new

require "rake/extensiontask"

# The gem build depends on the native extension being compiled.
task build: :compile

Rake::ExtensionTask.new("cgranges") do |extension|
  extension.lib_dir = "lib/bio/cgranges"
  extension.ext_dir = "ext/bio/cgranges"
end

desc "Remove object file"
task :remove_object_file do
  Dir.glob("ext/**/*.{o,bundle}").each { |path| FileUtils.rm(path) }
end

task default: %i[clobber compile remove_object_file test rubocop]

task cleanall: %i[clobber remove_object_file clean]
|
@@ -0,0 +1,23 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2019 Dana-Farber Cancer Institute
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
@@ -0,0 +1,133 @@
|
|
1
|
+
## Introduction
|
2
|
+
|
3
|
+
cgranges is a small C library for genomic interval overlap queries: given a
|
4
|
+
genomic region *r* and a set of regions *R*, finding all regions in *R* that
|
5
|
+
overlap *r*. Although this library is based on the [interval tree][itree], a well
|
6
|
+
known data structure, the core algorithm of cgranges is distinct from all
|
7
|
+
existing implementations to the best of our knowledge. Specifically, the
|
8
|
+
interval tree in cgranges is implicitly encoded as a plain sorted array
|
9
|
+
(similar to [binary heap][bheap] but packed differently). Tree
|
10
|
+
traversal is achieved by jumping between array indices. This treatment makes
|
11
|
+
cgranges very efficient and compact in memory. The core algorithm can be
|
12
|
+
implemented in ~50 lines of C++ code, much shorter than others as well. Please
|
13
|
+
see the code comments in [cpp/IITree.h](cpp/IITree.h) for details.
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
### Test with BED coverage
|
18
|
+
|
19
|
+
For testing purposes, this repo implements the [bedtools coverage][bedcov] tool
|
20
|
+
with cgranges. The source code is located in the [test/](test) directory. You
|
21
|
+
can compile and run the test with:
|
22
|
+
```sh
|
23
|
+
cd test && make
|
24
|
+
./bedcov-cr test1.bed test2.bed
|
25
|
+
```
|
26
|
+
The first BED file is loaded into RAM and indexed. The depth and the breadth of
|
27
|
+
coverage of each region in the second file is computed by query against the
|
28
|
+
index of the first file.
|
29
|
+
|
30
|
+
The [test/](test) directory also contains a few other implementations based on
|
31
|
+
[IntervalTree.h][ekg-itree] in C++, [quicksect][quicksect] in Cython and
|
32
|
+
[ncls][ncls] in Cython. The table below shows timing and peak memory on two
|
33
|
+
test BEDs available in the release page. The first BED contains GenCode
|
34
|
+
annotations with ~1.2 million lines, mixing all types of features. The second
|
35
|
+
contains ~10 million direct-RNA mappings. Time1a/Mem1a indexes the GenCode BED
|
36
|
+
into memory. Time1b adds whole chromosome intervals to the GenCode BED when
|
37
|
+
indexing. Time2/Mem2 indexes the RNA-mapping BED into memory. Numbers are
|
38
|
+
averaged over 5 runs.
|
39
|
+
|
40
|
+
|Algo. |Lang. |Cov|Program |Time1a|Time1b|Mem1a |Time2 |Mem2 |
|
41
|
+
|:-------|:-----|:-:|:---------------|-----:|-----:|-------:|-----:|-------:|
|
42
|
+
|IAITree |C |Y |cgranges |9.0s |13.9s |19.1MB |4.6s |138.4MB |
|
43
|
+
|IAITree |C++ |Y |cpp/iitree.h |11.1s |24.5s |22.4MB |5.8s |160.4MB |
|
44
|
+
|CITree |C++ |Y |IntervalTree.h |17.4s |17.4s |27.2MB |10.5s |179.5MB |
|
45
|
+
|IAITree |C |N |cgranges |7.6s |13.0s |19.1MB |4.1s |138.4MB |
|
46
|
+
|AIList |C |N |3rd-party/AIList|7.9s |8.1s |14.4MB |6.5s |104.8MB |
|
47
|
+
|NCList |C |N |3rd-party/NCList|13.0s |13.4s |21.4MB |10.6s |183.0MB |
|
48
|
+
|AITree |C |N |3rd-party/AITree|16.8s |18.4s |73.4MB |27.3s |546.4MB |
|
49
|
+
|IAITree |Cython|N |cgranges |56.6s |63.9s |23.4MB |43.9s |143.1MB |
|
50
|
+
|binning |C++ |Y |bedtools |201.9s|280.4s|478.5MB |149.1s|3438.1MB|
|
51
|
+
|
52
|
+
Here, IAITree = implicit augmented interval tree, used by cgranges;
|
53
|
+
CITree = centered interval tree, used by [Erik Garrison's
|
54
|
+
IntervalTree][ekg-itree]; AIList = augmented interval list, by [Feng et
|
55
|
+
al][ailist]; NCList = nested containment list, taken from [ncls][ncls] by Feng
|
56
|
+
et al; AITree = augmented interval tree, from [kerneltree][kerneltree].
|
57
|
+
"Cov" indicates whether the program calculates breadth of coverage.
|
58
|
+
Comments:
|
59
|
+
|
60
|
+
* AIList keeps start and end only. IAITree and CITree additionally store a
|
61
|
+
4-byte "ID" field per interval to reference the source of interval. This is
|
62
|
+
partly why AIList uses the least memory.
|
63
|
+
|
64
|
+
* IAITree is more sensitive to the worst case: the presence of an interval
|
65
|
+
spanning the whole chromosome.
|
66
|
+
|
67
|
+
* IAITree uses an efficient radix sort. CITree uses std::sort from STL, which
|
68
|
+
is ok. AIList and NCList use qsort from libc, which is slow. Faster sorting
|
69
|
+
leads to faster indexing.
|
70
|
+
|
71
|
+
* IAITree in C++ uses identical core algorithm to the C version, but limited by
|
72
|
+
its APIs, it wastes time on memory locality and management. CITree has a
|
73
|
+
similar issue.
|
74
|
+
|
75
|
+
* Computing coverage is better done when the returned list of intervals is
|
76
|
+
start sorted. IAITree returns sorted list. CITree doesn't. Not sure about
|
77
|
+
others. Computing coverage takes a couple of seconds. Sorting will be slower.
|
78
|
+
|
79
|
+
* Printing intervals also takes a noticeable fraction of time. Custom printf
|
80
|
+
equivalent would be faster.
|
81
|
+
|
82
|
+
* IAITree+Cython is a wrapper around the C version of cgranges. Cython adds
|
83
|
+
significant overhead.
|
84
|
+
|
85
|
+
* Bedtools is designed for a variety of applications in addition to computing
|
86
|
+
coverage. It may keep other information in its internal data structure. This
|
87
|
+
micro-benchmark may be unfair to bedtools.
|
88
|
+
|
89
|
+
* In general, the performance is affected a lot by subtle implementation
|
90
|
+
details. CITree, IAITree, NCList and AIList are all broadly comparable in
|
91
|
+
performance. AITree is not recommended when indexed intervals are immutable.
|
92
|
+
|
93
|
+
### Use cgranges as a C library
|
94
|
+
|
95
|
+
```c
|
96
|
+
cgranges_t *cr = cr_init(); // initialize a cgranges_t object
|
97
|
+
cr_add(cr, "chr1", 20, 30, 0); // add a genomic interval
|
98
|
+
cr_add(cr, "chr2", 10, 30, 1);
|
99
|
+
cr_add(cr, "chr1", 10, 25, 2);
|
100
|
+
cr_index(cr); // index
|
101
|
+
|
102
|
+
int64_t i, n, *b = 0, max_b = 0;
|
103
|
+
n = cr_overlap(cr, "chr1", 15, 22, &b, &max_b); // overlap query; output array b[] can be reused
|
104
|
+
for (i = 0; i < n; ++i) // traverse overlapping intervals
|
105
|
+
printf("%d\t%d\t%d\n", cr_start(cr, b[i]), cr_end(cr, b[i]), cr_label(cr, b[i]));
|
106
|
+
free(b); // b[] is allocated by malloc() inside cr_overlap(), so needs to be freed with free()
|
107
|
+
|
108
|
+
cr_destroy(cr);
|
109
|
+
```
|
110
|
+
|
111
|
+
### Use IITree as a C++ library
|
112
|
+
|
113
|
+
```cpp
|
114
|
+
IITree<int, int> tree;
|
115
|
+
tree.add(12, 34, 0); // add an interval
|
116
|
+
tree.add(0, 23, 1);
|
117
|
+
tree.add(34, 56, 2);
|
118
|
+
tree.index(); // index
|
119
|
+
std::vector<size_t> a;
|
120
|
+
tree.overlap(22, 25, a); // retrieve overlaps
|
121
|
+
for (size_t i = 0; i < a.size(); ++i)
|
122
|
+
printf("%d\t%d\t%d\n", tree.start(a[i]), tree.end(a[i]), tree.data(a[i]));
|
123
|
+
```
|
124
|
+
|
125
|
+
[bedcov]: https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html
|
126
|
+
[ekg-itree]: https://github.com/ekg/intervaltree
|
127
|
+
[quicksect]: https://github.com/brentp/quicksect
|
128
|
+
[ncls]: https://github.com/hunt-genes/ncls
|
129
|
+
[citree]: https://en.wikipedia.org/wiki/Interval_tree#Centered_interval_tree
|
130
|
+
[itree]: https://en.wikipedia.org/wiki/Interval_tree
|
131
|
+
[bheap]: https://en.wikipedia.org/wiki/Binary_heap
|
132
|
+
[ailist]: https://www.biorxiv.org/content/10.1101/593657v1
|
133
|
+
[kerneltree]: https://github.com/biocore-ntnu/kerneltree
|
@@ -0,0 +1,330 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include "cgranges.h"
|
4
|
+
#include "khash.h"
|
5
|
+
|
6
|
+
/**************
|
7
|
+
* Radix sort *
|
8
|
+
**************/
|
9
|
+
|
10
|
+
/* Ranges of at most RS_MIN_SIZE elements are sorted by insertion sort rather than by a radix pass. */
#define RS_MIN_SIZE 64
|
11
|
+
/* Largest number of key bits consumed per radix pass; the bucket array holds 1<<RS_MAX_BITS entries. */
#define RS_MAX_BITS 8
|
12
|
+
|
13
|
+
/*
 * KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key)
 *
 * Generates an in-place radix sort for arrays of rstype_t ordered by the
 * unsigned integer key rskey(element):
 *
 *   rsbucket_<name>_t     - [b, e) pointer pair describing one bucket.
 *   rs_insertsort_<name>  - insertion sort of [beg, end) on the full key.
 *   rs_sort_<name>        - one most-significant-digit pass on the n_bits-wide
 *                           digit (rskey >> s) & m: counts the elements per
 *                           bucket, turns the counts into bucket boundaries,
 *                           moves every element into its bucket by following
 *                           displacement cycles, and, while s is non-zero,
 *                           sorts each bucket on the next lower digit
 *                           (recursing when the bucket holds more than
 *                           RS_MIN_SIZE elements, insertion sort otherwise).
 *   radix_sort_<name>     - entry point; sorts [beg, end), starting at the
 *                           top digit of a key that is sizeof_key bytes wide.
 *
 * n_bits must not exceed RS_MAX_BITS (checked with assert).
 */
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
|
14
|
+
typedef struct { \
|
15
|
+
rstype_t *b, *e; \
|
16
|
+
} rsbucket_##name##_t; \
|
17
|
+
void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
|
18
|
+
{ \
|
19
|
+
rstype_t *i; \
|
20
|
+
for (i = beg + 1; i < end; ++i) \
|
21
|
+
if (rskey(*i) < rskey(*(i - 1))) { \
|
22
|
+
rstype_t *j, tmp = *i; \
|
23
|
+
for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
|
24
|
+
*j = *(j - 1); \
|
25
|
+
*j = tmp; \
|
26
|
+
} \
|
27
|
+
} \
|
28
|
+
void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
|
29
|
+
{ \
|
30
|
+
rstype_t *i; \
|
31
|
+
int size = 1<<n_bits, m = size - 1; \
|
32
|
+
rsbucket_##name##_t *k, b[1<<RS_MAX_BITS], *be = b + size; \
|
33
|
+
assert(n_bits <= RS_MAX_BITS); \
|
34
|
+
for (k = b; k != be; ++k) k->b = k->e = beg; \
|
35
|
+
for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \
|
36
|
+
for (k = b + 1; k != be; ++k) \
|
37
|
+
k->e += (k-1)->e - beg, k->b = (k-1)->e; \
|
38
|
+
for (k = b; k != be;) { \
|
39
|
+
if (k->b != k->e) { \
|
40
|
+
rsbucket_##name##_t *l; \
|
41
|
+
if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
|
42
|
+
rstype_t tmp = *k->b, swap; \
|
43
|
+
do { \
|
44
|
+
swap = tmp; tmp = *l->b; *l->b++ = swap; \
|
45
|
+
l = b + (rskey(tmp)>>s&m); \
|
46
|
+
} while (l != k); \
|
47
|
+
*k->b++ = tmp; \
|
48
|
+
} else ++k->b; \
|
49
|
+
} else ++k; \
|
50
|
+
} \
|
51
|
+
for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \
|
52
|
+
if (s) { \
|
53
|
+
s = s > n_bits? s - n_bits : 0; \
|
54
|
+
for (k = b; k != be; ++k) \
|
55
|
+
if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
|
56
|
+
else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
|
57
|
+
} \
|
58
|
+
} \
|
59
|
+
void radix_sort_##name(rstype_t *beg, rstype_t *end) \
|
60
|
+
{ \
|
61
|
+
if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
|
62
|
+
else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
|
63
|
+
}
|
64
|
+
|
65
|
+
/*********************
|
66
|
+
* Convenient macros *
|
67
|
+
*********************/
|
68
|
+
|
69
|
+
/*
 * kroundup32(x): modifies x in place. It decrements x, ORs x with copies of
 * itself shifted right by 1, 2, 4, 8 and 16 bits, then increments it. x is
 * evaluated several times, so it must be a side-effect-free lvalue.
 * Only defined here when no earlier header has defined it.
 */
#ifndef kroundup32
|
70
|
+
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
71
|
+
#endif
|
72
|
+
|
73
|
+
/* CALLOC(type, len): zero-initialised array of len objects of the given type, via calloc(). */
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
|
74
|
+
/*
 * REALLOC(ptr, len): resizes ptr to hold len elements and assigns the result
 * back to ptr. NOTE(review): the realloc() result is not checked, so on
 * failure ptr becomes NULL and the old buffer is lost. Uses the __typeof__
 * compiler extension.
 */
#define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr))))
|
75
|
+
|
76
|
+
/*
 * EXPAND(a, m): grows the capacity m of array a by half (or to 16 when m is
 * zero) and reallocates a to the new capacity. Both arguments are evaluated
 * more than once and must be plain lvalues.
 */
#define EXPAND(a, m) do { \
|
77
|
+
(m) = (m)? (m) + ((m)>>1) : 16; \
|
78
|
+
REALLOC((a), (m)); \
|
79
|
+
} while (0)
|
80
|
+
|
81
|
+
/********************
|
82
|
+
* Basic operations *
|
83
|
+
********************/
|
84
|
+
|
85
|
+
/* Sort key of an interval: its 64-bit x field (cr_add() packs the contig id above the start). */
#define cr_intv_key(r) ((r).x)
|
86
|
+
/* Instantiates radix_sort_cr_intv() and its helpers for cr_intv_t with an 8-byte key. */
KRADIX_SORT_INIT(cr_intv, cr_intv_t, cr_intv_key, 8)
|
87
|
+
|
88
|
+
/* String-keyed hash map with int32_t values; cr_add_ctg() stores each contig's index in cr->ctg under its name. */
KHASH_MAP_INIT_STR(str, int32_t)
|
89
|
+
/* Concrete hash-table type that cgranges_t's hc field is cast to in this file. */
typedef khash_t(str) strhash_t;
|
90
|
+
|
91
|
+
cgranges_t *cr_init(void)
|
92
|
+
{
|
93
|
+
cgranges_t *cr;
|
94
|
+
cr = CALLOC(cgranges_t, 1);
|
95
|
+
cr->hc = kh_init(str);
|
96
|
+
return cr;
|
97
|
+
}
|
98
|
+
|
99
|
+
/**
 * Release a collection and everything it owns.
 *
 * Frees every contig name, the contig array, the interval array, the
 * contig-name hash table and finally the object itself. The hash keys are
 * the contig-name strings, so they are freed through the contig array.
 *
 * @param cr collection to destroy; NULL is accepted and ignored.
 */
void cr_destroy(cgranges_t *cr)
{
	int32_t i;
	if (cr == NULL) return;
	for (i = 0; i < cr->n_ctg; ++i)
		free(cr->ctg[i].name);
	free(cr->ctg);
	free(cr->r); /* interval array grown by cr_add(); previously leaked */
	kh_destroy(str, (strhash_t*)cr->hc);
	free(cr);
}
|
109
|
+
|
110
|
+
/**
 * Look up a contig by name, registering it first if it is not yet known.
 *
 * A newly registered contig gets a private copy of the name, no intervals
 * (n = 0) and no offset (off = -1). The stored length is raised to len when
 * len is larger than the current value.
 *
 * @param cr  collection to update
 * @param ctg contig name
 * @param len contig length hint
 * @return index of the contig in cr->ctg
 */
int32_t cr_add_ctg(cgranges_t *cr, const char *ctg, int32_t len)
{
	strhash_t *names = (strhash_t*)cr->hc;
	int is_new;
	khint_t slot = kh_put(str, names, ctg, &is_new);
	int32_t id;

	if (is_new) {
		cr_ctg_t *entry;
		if (cr->n_ctg == cr->m_ctg)
			EXPAND(cr->ctg, cr->m_ctg);
		kh_val(names, slot) = cr->n_ctg;
		entry = &cr->ctg[cr->n_ctg++];
		entry->name = strdup(ctg);
		kh_key(names, slot) = entry->name; /* key must outlive the caller's string */
		entry->len = len;
		entry->n = 0;
		entry->off = -1;
	}

	id = kh_val(names, slot);
	if (cr->ctg[id].len < len)
		cr->ctg[id].len = len;
	return id;
}
|
131
|
+
|
132
|
+
int32_t cr_get_ctg(const cgranges_t *cr, const char *ctg)
|
133
|
+
{
|
134
|
+
khint_t k;
|
135
|
+
strhash_t *h = (strhash_t*)cr->hc;
|
136
|
+
k = kh_get(str, h, ctg);
|
137
|
+
return k == kh_end(h)? -1 : kh_val(h, k);
|
138
|
+
}
|
139
|
+
|
140
|
+
/**
 * Append the interval [st, en) on contig ctg to the collection.
 *
 * The contig is registered on first use, and its recorded length is raised
 * to en when en is larger. The interval is stored unsorted; call cr_index()
 * before querying.
 *
 * @param cr        collection to update
 * @param ctg       contig name
 * @param st        start coordinate; must satisfy 0 <= st <= en
 * @param en        end coordinate
 * @param label_int caller-defined label stored with the interval
 * @return pointer to the stored interval (it may be invalidated by a later
 *         cr_add() that grows the array), or NULL if the coordinates are
 *         rejected
 */
cr_intv_t *cr_add(cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int32_t label_int)
{
	cr_intv_t *p;
	int32_t k;
	/* A negative start would sign-extend into the upper 32 bits of the key
	 * and overwrite the contig id, so it is rejected like st > en. */
	if (st < 0 || st > en) return NULL;
	k = cr_add_ctg(cr, ctg, 0);
	if (cr->n_r == cr->m_r)
		EXPAND(cr->r, cr->m_r);
	p = &cr->r[cr->n_r++];
	p->x = (uint64_t)k << 32 | (uint32_t)st; /* key: contig id above start */
	p->y = en;
	p->label = label_int;
	if (cr->ctg[k].len < en)
		cr->ctg[k].len = en;
	return p;
}
|
156
|
+
|
157
|
+
/**
 * Sort the stored intervals by their 64-bit key x.
 *
 * Does nothing when the collection has no contigs or no intervals.
 *
 * @param cr collection whose interval array is sorted in place
 */
void cr_sort(cgranges_t *cr)
{
	if (cr->n_ctg != 0 && cr->n_r != 0)
		radix_sort_cr_intv(cr->r, cr->r + cr->n_r);
}
|
162
|
+
|
163
|
+
int32_t cr_is_sorted(const cgranges_t *cr)
|
164
|
+
{
|
165
|
+
uint64_t i;
|
166
|
+
for (i = 1; i < cr->n_r; ++i)
|
167
|
+
if (cr->r[i-1].x > cr->r[i].x)
|
168
|
+
break;
|
169
|
+
return (i == cr->n_r);
|
170
|
+
}
|
171
|
+
|
172
|
+
/************
|
173
|
+
* Indexing *
|
174
|
+
************/
|
175
|
+
|
176
|
+
/**
 * Get the interval array ready for per-contig indexing.
 *
 * 1. Sorts the intervals if they are not already sorted.
 * 2. Records, for every run of intervals that share the upper 32 bits of x
 *    (the contig id), the run's offset and length in cr->ctg.
 * 3. Repacks each interval: x's low 32 bits move to the upper half, y (the
 *    end) is OR-ed into the lower half, and y is cleared.
 *
 * @param cr collection to prepare; modified in place
 */
void cr_index_prepare(cgranges_t *cr)
{
	int64_t run_start = 0, pos;

	if (!cr_is_sorted(cr))
		cr_sort(cr);

	/* pos == n_r is a sentinel step that closes the final run */
	for (pos = 1; pos <= cr->n_r; ++pos) {
		int32_t ctg;
		if (pos != cr->n_r && cr->r[pos].x>>32 == cr->r[run_start].x>>32)
			continue;
		ctg = cr->r[run_start].x>>32;
		cr->ctg[ctg].off = run_start;
		cr->ctg[ctg].n = pos - run_start;
		run_start = pos;
	}

	for (pos = 0; pos < cr->n_r; ++pos) {
		cr_intv_t *iv = &cr->r[pos];
		iv->x = iv->x<<32 | iv->y;
		iv->y = 0;
	}
}
|
194
|
+
|
195
|
+
/**
 * Fill in the y field of each interval in one contig's sorted sub-array.
 *
 * Even-indexed elements get their own end (the low 32 bits of x). At each
 * higher level k, the node at index i takes the maximum of its own end and
 * the y values at i - x and i + x, where x = 1 << (k-1). When i + x lies
 * past the end of the array, a running value tracked along the right edge
 * of the array is used instead.
 *
 * @param a first interval of the contig (x must already hold start<<32 | end)
 * @param n number of intervals
 * @return the last level k processed (so 1 << k <= n), or -1 when n <= 0
 */
int32_t cr_index1(cr_intv_t *a, int64_t n)
{
	int64_t idx, tail_idx = 0;
	int32_t tail_max = 0, level;

	if (n <= 0)
		return -1;

	/* level 0: even indices are leaves */
	for (idx = 0; idx < n; idx += 2) {
		a[idx].y = (int32_t)a[idx].x;
		tail_idx = idx;
		tail_max = a[idx].y;
	}

	for (level = 1; 1LL<<level <= n; ++level) {
		const int64_t half = 1LL<<(level-1);
		const int64_t first = (half<<1) - 1;
		const int64_t stride = half<<2;

		for (idx = first; idx < n; idx += stride) {
			const int32_t left_max = a[idx - half].y;
			const int32_t right_max = idx + half < n? a[idx + half].y : tail_max;
			int32_t max_end = (int32_t)a[idx].x;
			if (left_max > max_end) max_end = left_max;
			if (right_max > max_end) max_end = right_max;
			a[idx].y = max_end;
		}

		/* move the right-edge tracker up one level */
		if ((tail_idx>>level) & 1)
			tail_idx -= half;
		else
			tail_idx += half;
		if (tail_idx < n && a[tail_idx].y > tail_max)
			tail_max = a[tail_idx].y;
	}
	return level - 1;
}
|
217
|
+
|
218
|
+
/**
 * Build the query index for every contig.
 *
 * Sorts and repacks the intervals with cr_index_prepare(), then indexes each
 * contig's sub-array and stores the resulting root level in root_k. Contigs
 * without intervals get root_k = -1.
 *
 * @param cr collection to index; modified in place
 */
void cr_index(cgranges_t *cr)
{
	int32_t i;
	cr_index_prepare(cr);
	for (i = 0; i < cr->n_ctg; ++i) {
		cr_ctg_t *c = &cr->ctg[i];
		/* An empty contig keeps off == -1; avoid forming &cr->r[-1] for it. */
		c->root_k = c->n > 0? cr_index1(&cr->r[c->off], c->n) : -1;
	}
}
|
225
|
+
|
226
|
+
/*********
|
227
|
+
* Query *
|
228
|
+
*********/
|
229
|
+
|
230
|
+
int64_t cr_min_start_int(const cgranges_t *cr, int32_t ctg_id, int32_t st) // find the smallest i such that cr_st(&r[i]) >= st
|
231
|
+
{
|
232
|
+
int64_t left, right;
|
233
|
+
const cr_ctg_t *c;
|
234
|
+
const cr_intv_t *r;
|
235
|
+
|
236
|
+
if (ctg_id < 0 || ctg_id >= cr->n_ctg) return -1;
|
237
|
+
c = &cr->ctg[ctg_id];
|
238
|
+
r = &cr->r[c->off];
|
239
|
+
if (c->n == 0) return -1;
|
240
|
+
left = 0, right = c->n;
|
241
|
+
while (right > left) {
|
242
|
+
int64_t mid = left + ((right - left) >> 1);
|
243
|
+
if (cr_st(&r[mid]) >= st) right = mid;
|
244
|
+
else left = mid + 1;
|
245
|
+
}
|
246
|
+
assert(left == right);
|
247
|
+
return left == c->n? -1 : c->off + left;
|
248
|
+
}
|
249
|
+
|
250
|
+
typedef struct {
|
251
|
+
int64_t x;
|
252
|
+
int32_t k, w;
|
253
|
+
} istack_t;
|
254
|
+
|
255
|
+
int64_t cr_overlap_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
|
256
|
+
{
|
257
|
+
int32_t t = 0;
|
258
|
+
const cr_ctg_t *c;
|
259
|
+
const cr_intv_t *r;
|
260
|
+
int64_t *b = *b_, m_b = *m_b_, n = 0;
|
261
|
+
istack_t stack[64], *p;
|
262
|
+
|
263
|
+
if (ctg_id < 0 || ctg_id >= cr->n_ctg) return 0;
|
264
|
+
c = &cr->ctg[ctg_id];
|
265
|
+
r = &cr->r[c->off];
|
266
|
+
p = &stack[t++];
|
267
|
+
p->k = c->root_k, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
|
268
|
+
while (t) { // stack is not empyt
|
269
|
+
istack_t z = stack[--t];
|
270
|
+
if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
|
271
|
+
int64_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
|
272
|
+
if (i1 >= c->n) i1 = c->n;
|
273
|
+
for (i = i0; i < i1 && cr_st(&r[i]) < en; ++i)
|
274
|
+
if (st < cr_en(&r[i])) {
|
275
|
+
if (n == m_b) EXPAND(b, m_b);
|
276
|
+
b[n++] = c->off + i;
|
277
|
+
}
|
278
|
+
} else if (z.w == 0) { // if left child not processed
|
279
|
+
int64_t y = z.x - (1LL<<(z.k-1));
|
280
|
+
p = &stack[t++];
|
281
|
+
p->k = z.k, p->x = z.x, p->w = 1;
|
282
|
+
if (y >= c->n || r[y].y > st) {
|
283
|
+
p = &stack[t++];
|
284
|
+
p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
|
285
|
+
}
|
286
|
+
} else if (z.x < c->n && cr_st(&r[z.x]) < en) {
|
287
|
+
if (st < cr_en(&r[z.x])) { // then z.x overlaps the query; write to the output array
|
288
|
+
if (n == m_b) EXPAND(b, m_b);
|
289
|
+
b[n++] = c->off + z.x;
|
290
|
+
}
|
291
|
+
p = &stack[t++];
|
292
|
+
p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
|
293
|
+
}
|
294
|
+
}
|
295
|
+
*b_ = b, *m_b_ = m_b;
|
296
|
+
return n;
|
297
|
+
}
|
298
|
+
|
299
|
+
int64_t cr_contain_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
|
300
|
+
{
|
301
|
+
int64_t n = 0, i, s, e, *b = *b_, m_b = *m_b_;
|
302
|
+
s = cr_min_start_int(cr, ctg_id, st);
|
303
|
+
if (s < 0) return 0;
|
304
|
+
e = cr->ctg[ctg_id].off + cr->ctg[ctg_id].n;
|
305
|
+
for (i = s; i < e; ++i) {
|
306
|
+
const cr_intv_t *r = &cr->r[i];
|
307
|
+
if (cr_st(r) >= en) break;
|
308
|
+
if (cr_st(r) >= st && cr_en(r) <= en) {
|
309
|
+
if (n == m_b) EXPAND(b, m_b);
|
310
|
+
b[n++] = i;
|
311
|
+
}
|
312
|
+
}
|
313
|
+
*b_ = b, *m_b_ = m_b;
|
314
|
+
return n;
|
315
|
+
}
|
316
|
+
|
317
|
+
/**
 * Name-based front end to cr_min_start_int(): resolves ctg with
 * cr_get_ctg() and forwards the query.
 *
 * @return whatever cr_min_start_int() returns for the resolved contig id
 */
int64_t cr_min_start(const cgranges_t *cr, const char *ctg, int32_t st)
{
	const int32_t ctg_id = cr_get_ctg(cr, ctg);
	return cr_min_start_int(cr, ctg_id, st);
}
|
321
|
+
|
322
|
+
/**
 * Name-based front end to cr_overlap_int(): resolves ctg with
 * cr_get_ctg() and forwards the query and the result buffer.
 *
 * @return whatever cr_overlap_int() returns for the resolved contig id
 */
int64_t cr_overlap(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
{
	const int32_t ctg_id = cr_get_ctg(cr, ctg);
	return cr_overlap_int(cr, ctg_id, st, en, b_, m_b_);
}
|
326
|
+
|
327
|
+
/**
 * Name-based front end to cr_contain_int(): resolves ctg with
 * cr_get_ctg() and forwards the query and the result buffer.
 *
 * @return whatever cr_contain_int() returns for the resolved contig id
 */
int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
{
	const int32_t ctg_id = cr_get_ctg(cr, ctg);
	return cr_contain_int(cr, ctg_id, st, en, b_, m_b_);
}
|