bio-cgranges 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 618c6866b3f708b148682a529b73efda7b782bfc567f92949f72c237fd98435b
4
+ data.tar.gz: 0ebbc69b7858f934fd98f60500e49b7300774b21e05ee04c654d55746e18f00b
5
+ SHA512:
6
+ metadata.gz: 0444fad48ec7b6266072f2b7fb23684bf295703121c6d0b5b392e3c150de6dda02adbd1ecff161793b5f1a821f8d10cac564c4c60042f5fc4cc191bb26181620
7
+ data.tar.gz: 7f783598ec7ed7937c8163593901f055463175e4b16ab0073cf41bb96341f858077877524c5e1409369822e1efd5eaed80a11acc6c7678d9e27dee4ee3e14dec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 kojix2
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,65 @@
1
+ # bio-cgranges
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/bio-cgranges.svg)](https://badge.fury.io/rb/bio-cgranges)
4
+ [![test](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml/badge.svg)](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml)
5
+ [![docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/bio-cgranges)
6
+
7
+ Ruby bindings to [lh3/cgranges](https://github.com/lh3/cgranges).
8
+
9
+ > cgranges is a small C library for genomic interval overlap queries
10
+
11
+ ## Installation
12
+
13
+ ```sh
14
+ gem install bio-cgranges
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ```ruby
20
+ require "bio/cgranges"
21
+
22
+ granges = Bio::CGRanges.new
23
+ granges.add("chr1", 10, 20, 0)
24
+ .add("chr1", 15, 25, 1)
25
+ .add("chr1", 30, 40, 2)
26
+ .add("chr1", 10, 25, 3)
27
+ .add("chr1", 15, 20, 4)
28
+ .add("chr2", 10, 20, 5)
29
+ .index
30
+
31
+ granges.overlap("chr1", 12, 22)
32
+ # [["chr1", 10, 20, 0],
33
+ # ["chr1", 10, 25, 3],
34
+ # ["chr1", 15, 25, 1],
35
+ # ["chr1", 15, 20, 4]]
36
+
37
+ granges.contain("chr1", 12, 22)
38
+ # [["chr1", 15, 20, 4]]
39
+ ```
40
+
41
+ ```
42
+ 0.........5.........10........15........20........25........30........35........40
43
+ |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
44
+ 0-0-0-0-0-0-0-0-0-0
45
+ 1-1-1-1-1-1-1-1-1-1
46
+ 2-2-2-2-2-2-2-2-2-2
47
+ 3-3-3-3-3-3-3-3-3-3-3-3-3-3-3
48
+ 4-4-4-4-4
49
+ 5-5-5-5-5
50
+ Q-Q-Q-Q-Q-Q-Q-Q-Q-Q
51
+ |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
52
+ 0.........5.........10........15........20........25........30........35........40
53
+ ```
54
+
55
+ ## Development
56
+
57
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ruby-on-bioc/bio-cgranges.
58
+
59
+ Do you need commit rights to my repository?
60
+ Do you want to get admin rights and take over the project?
61
+ If so, please feel free to contact us @kojix2.
62
+
63
+ ## License
64
+
65
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,44 @@
1
# frozen_string_literal: true

require "bundler/gem_tasks"
require "rake/testtask"

# `rake test`: run every *_test.rb under test/ with test/ and lib/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push("test", "lib")
  t.test_files = FileList["test/**/*_test.rb"]
end

require "rubocop/rake_task"

RuboCop::RakeTask.new

require "rake/extensiontask"

# Building the gem requires the native extension to be compiled first.
task build: :compile

# `rake compile`: build the extension from ext/bio/cgranges into lib/bio/cgranges.
Rake::ExtensionTask.new("cgranges") do |ext|
  ext.lib_dir = "lib/bio/cgranges"
  ext.ext_dir = "ext/bio/cgranges"
end

desc "Remove object file"
task :remove_object_file do
  Dir.glob("ext/**/*.{o,bundle}").each { |f| FileUtils.rm(f) }
end

task default: %i[clobber compile remove_object_file test rubocop]

task cleanall: %i[clobber remove_object_file clean]
@@ -0,0 +1,23 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2019 Dana-Farber Cancer Institute
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
@@ -0,0 +1,133 @@
1
+ ## Introduction
2
+
3
+ cgranges is a small C library for genomic interval overlap queries: given a
4
+ genomic region *r* and a set of regions *R*, finding all regions in *R* that
5
+ overlap *r*. Although this library is based on [interval tree][itree], a well
6
+ known data structure, the core algorithm of cgranges is distinct from all
7
+ existing implementations to the best of our knowledge. Specifically, the
8
+ interval tree in cgranges is implicitly encoded as a plain sorted array
9
+ (similar to [binary heap][bheap] but packed differently). Tree
10
+ traversal is achieved by jumping between array indices. This treatment makes
11
+ cgranges very efficient and compact in memory. The core algorithm can be
12
+ implemented in ~50 lines of C++ code, much shorter than others as well. Please
13
+ see the code comments in [cpp/IITree.h](cpp/IITree.h) for details.
14
+
15
+ ## Usage
16
+
17
+ ### Test with BED coverage
18
+
19
+ For testing purposes, this repo implements the [bedtools coverage][bedcov] tool
20
+ with cgranges. The source code is located in the [test/](test) directory. You
21
+ can compile and run the test with:
22
+ ```sh
23
+ cd test && make
24
+ ./bedcov-cr test1.bed test2.bed
25
+ ```
26
+ The first BED file is loaded into RAM and indexed. The depth and the breadth of
27
+ coverage of each region in the second file is computed by query against the
28
+ index of the first file.
29
+
30
+ The [test/](test) directory also contains a few other implementations based on
31
+ [IntervalTree.h][ekg-itree] in C++, [quicksect][quicksect] in Cython and
32
+ [ncls][ncls] in Cython. The table below shows timing and peak memory on two
33
+ test BEDs available in the release page. The first BED contains GenCode
34
+ annotations with ~1.2 million lines, mixing all types of features. The second
35
+ contains ~10 million direct-RNA mappings. Time1a/Mem1a indexes the GenCode BED
36
+ into memory. Time1b adds whole chromosome intervals to the GenCode BED when
37
+ indexing. Time2/Mem2 indexes the RNA-mapping BED into memory. Numbers are
38
+ averaged over 5 runs.
39
+
40
+ |Algo. |Lang. |Cov|Program |Time1a|Time1b|Mem1a |Time2 |Mem2 |
41
+ |:-------|:-----|:-:|:---------------|-----:|-----:|-------:|-----:|-------:|
42
+ |IAITree |C |Y |cgranges |9.0s |13.9s |19.1MB |4.6s |138.4MB |
43
+ |IAITree |C++ |Y |cpp/iitree.h |11.1s |24.5s |22.4MB |5.8s |160.4MB |
44
+ |CITree |C++ |Y |IntervalTree.h |17.4s |17.4s |27.2MB |10.5s |179.5MB |
45
+ |IAITree |C |N |cgranges |7.6s |13.0s |19.1MB |4.1s |138.4MB |
46
+ |AIList |C |N |3rd-party/AIList|7.9s |8.1s |14.4MB |6.5s |104.8MB |
47
+ |NCList |C |N |3rd-party/NCList|13.0s |13.4s |21.4MB |10.6s |183.0MB |
48
+ |AITree |C |N |3rd-party/AITree|16.8s |18.4s |73.4MB |27.3s |546.4MB |
49
+ |IAITree |Cython|N |cgranges |56.6s |63.9s |23.4MB |43.9s |143.1MB |
50
+ |binning |C++ |Y |bedtools |201.9s|280.4s|478.5MB |149.1s|3438.1MB|
51
+
52
+ Here, IAITree = implicit augmented interval tree, used by cgranges;
53
+ CITree = centered interval tree, used by [Erik Garrison's
54
+ IntervalTree][ekg-itree]; AIList = augmented interval list, by [Feng et
55
+ al][ailist]; NCList = nested containment list, taken from [ncls][ncls] by Feng
56
+ et al; AITree = augmented interval tree, from [kerneltree][kerneltree].
57
+ "Cov" indicates whether the program calculates breadth of coverage.
58
+ Comments:
59
+
60
+ * AIList keeps start and end only. IAITree and CITree additionally store a
61
+ 4-byte "ID" field per interval to reference the source of interval. This is
62
+ partly why AIList uses the least memory.
63
+
64
+ * IAITree is more sensitive to the worst case: the presence of an interval
65
+ spanning the whole chromosome.
66
+
67
+ * IAITree uses an efficient radix sort. CITree uses std::sort from STL, which
68
+ is ok. AIList and NCList use qsort from libc, which is slow. Faster sorting
69
+ leads to faster indexing.
70
+
71
+ * IAITree in C++ uses identical core algorithm to the C version, but limited by
72
+ its APIs, it wastes time on memory locality and management. CITree has a
73
+ similar issue.
74
+
75
+ * Computing coverage is better done when the returned list of intervals are
76
+ start sorted. IAITree returns sorted list. CITree doesn't. Not sure about
77
+ others. Computing coverage takes a couple of seconds. Sorting will be slower.
78
+
79
+ * Printing intervals also takes a noticeable fraction of time. Custom printf
80
+ equivalent would be faster.
81
+
82
+ * IAITree+Cython is a wrapper around the C version of cgranges. Cython adds
83
+ significant overhead.
84
+
85
+ * Bedtools is designed for a variety of applications in addition to computing
86
+ coverage. It may keep other information in its internal data structure. This
87
+ micro-benchmark may be unfair to bedtools.
88
+
89
+ * In general, the performance is affected a lot by subtle implementation
90
+ details. CITree, IAITree, NCList and AIList are all broadly comparable in
91
+ performance. AITree is not recommended when indexed intervals are immutable.
92
+
93
+ ### Use cgranges as a C library
94
+
95
+ ```c
96
+ cgranges_t *cr = cr_init(); // initialize a cgranges_t object
97
+ cr_add(cr, "chr1", 20, 30, 0); // add a genomic interval
98
+ cr_add(cr, "chr2", 10, 30, 1);
99
+ cr_add(cr, "chr1", 10, 25, 2);
100
+ cr_index(cr); // index
101
+
102
+ int64_t i, n, *b = 0, max_b = 0;
103
+ n = cr_overlap(cr, "chr1", 15, 22, &b, &max_b); // overlap query; output array b[] can be reused
104
+ for (i = 0; i < n; ++i) // traverse overlapping intervals
105
+ printf("%d\t%d\t%d\n", cr_start(cr, b[i]), cr_end(cr, b[i]), cr_label(cr, b[i]));
106
+ free(b); // b[] is allocated by malloc() inside cr_overlap(), so needs to be freed with free()
107
+
108
+ cr_destroy(cr);
109
+ ```
110
+
111
+ ### Use IITree as a C++ library
112
+
113
+ ```cpp
114
+ IITree<int, int> tree;
115
+ tree.add(12, 34, 0); // add an interval
116
+ tree.add(0, 23, 1);
117
+ tree.add(34, 56, 2);
118
+ tree.index(); // index
119
+ std::vector<size_t> a;
120
+ tree.overlap(22, 25, a); // retrieve overlaps
121
+ for (size_t i = 0; i < a.size(); ++i)
122
+ printf("%d\t%d\t%d\n", tree.start(a[i]), tree.end(a[i]), tree.data(a[i]));
123
+ ```
124
+
125
+ [bedcov]: https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html
126
+ [ekg-itree]: https://github.com/ekg/intervaltree
127
+ [quicksect]: https://github.com/brentp/quicksect
128
+ [ncls]: https://github.com/hunt-genes/ncls
129
+ [citree]: https://en.wikipedia.org/wiki/Interval_tree#Centered_interval_tree
130
+ [itree]: https://en.wikipedia.org/wiki/Interval_tree
131
+ [bheap]: https://en.wikipedia.org/wiki/Binary_heap
132
+ [ailist]: https://www.biorxiv.org/content/10.1101/593657v1
133
+ [kerneltree]: https://github.com/biocore-ntnu/kerneltree
@@ -0,0 +1,330 @@
1
+ #include <stdio.h>
2
+ #include <assert.h>
3
+ #include "cgranges.h"
4
+ #include "khash.h"
5
+
6
+ /**************
7
+ * Radix sort *
8
+ **************/
9
+
10
#define RS_MIN_SIZE 64 /* ranges of at most this many elements are insertion-sorted */
#define RS_MAX_BITS 8  /* upper bound on the number of key bits consumed per pass */

/*
 * KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key)
 *
 * Generates three functions that sort an array of rstype_t in place, in
 * ascending order of the integer key produced by rskey(element):
 *
 *   rs_insertsort_<name>(beg, end)      insertion sort of [beg, end)
 *   rs_sort_<name>(beg, end, n_bits, s) one radix pass on the n_bits-wide digit
 *                                       at bit offset s, then recursion on each
 *                                       bucket with the next lower digit
 *   radix_sort_<name>(beg, end)         entry point; starts at the most
 *                                       significant byte of a sizeof_key-byte key
 *
 * The expansion calls assert(), so <assert.h> must be included by the user.
 */
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
	typedef struct { \
		rstype_t *b; /* next unfilled slot of the bucket */ \
		rstype_t *e; /* one past the last slot of the bucket */ \
	} rsbucket_##name##_t; \
	void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
	{ \
		rstype_t *i; \
		for (i = beg + 1; i < end; ++i) { \
			rstype_t *j, tmp; \
			if (!(rskey(*i) < rskey(*(i - 1)))) \
				continue; /* already in place relative to its predecessor */ \
			tmp = *i; \
			j = i; \
			while (j > beg && rskey(tmp) < rskey(*(j - 1))) { \
				*j = *(j - 1); \
				--j; \
			} \
			*j = tmp; \
		} \
	} \
	void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
	{ \
		rstype_t *i; \
		int size = 1 << n_bits, m = size - 1; \
		rsbucket_##name##_t *k, b[1 << RS_MAX_BITS], *be = b + size; \
		assert(n_bits <= RS_MAX_BITS); \
		/* count the elements falling into each bucket (stored as beg + count) */ \
		for (k = b; k != be; ++k) \
			k->b = k->e = beg; \
		for (i = beg; i != end; ++i) \
			++b[(rskey(*i) >> s) & m].e; \
		/* turn the counts into [b, e) ranges laid out back to back */ \
		for (k = b + 1; k != be; ++k) { \
			k->e += (k - 1)->e - beg; \
			k->b = (k - 1)->e; \
		} \
		/* move every element into its bucket by following displacement cycles */ \
		for (k = b; k != be;) { \
			rsbucket_##name##_t *l; \
			rstype_t tmp, swap; \
			if (k->b == k->e) { /* bucket k is complete */ \
				++k; \
				continue; \
			} \
			l = b + ((rskey(*k->b) >> s) & m); \
			if (l == k) { /* element already lives in its own bucket */ \
				++k->b; \
				continue; \
			} \
			tmp = *k->b; \
			do { \
				swap = tmp; \
				tmp = *l->b; \
				*l->b++ = swap; \
				l = b + ((rskey(tmp) >> s) & m); \
			} while (l != k); \
			*k->b++ = tmp; \
		} \
		/* the b pointers were consumed above; restore them to the bucket starts */ \
		for (b->b = beg, k = b + 1; k != be; ++k) \
			k->b = (k - 1)->e; \
		if (s) { /* lower digits remain: sort inside each bucket */ \
			s = s > n_bits ? s - n_bits : 0; \
			for (k = b; k != be; ++k) { \
				if (k->e - k->b > RS_MIN_SIZE) \
					rs_sort_##name(k->b, k->e, n_bits, s); \
				else if (k->e - k->b > 1) \
					rs_insertsort_##name(k->b, k->e); \
			} \
		} \
	} \
	void radix_sort_##name(rstype_t *beg, rstype_t *end) \
	{ \
		if (end - beg > RS_MIN_SIZE) \
			rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
		else \
			rs_insertsort_##name(beg, end); \
	}
64
+
65
+ /*********************
66
+ * Convenient macros *
67
+ *********************/
68
+
69
#ifndef kroundup32
/* Round x up, in place, to the next power of two (x is modified several times). */
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, \
	 (x) |= (x) >> 2, \
	 (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, \
	 (x) |= (x) >> 16, \
	 ++(x))
#endif

/* Allocate a zero-filled array of `len` elements of `type`. */
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))

/* Resize the array `ptr` to `len` elements; `ptr` is overwritten with realloc()'s result. */
#define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr))))

/* Grow array `a` of capacity `m`: 16 slots the first time, 1.5x afterwards. */
#define EXPAND(a, m) do { \
		if (m) \
			(m) += (m) >> 1; \
		else \
			(m) = 16; \
		REALLOC((a), (m)); \
	} while (0)
80
+
81
+ /********************
82
+ * Basic operations *
83
+ ********************/
84
+
85
+ #define cr_intv_key(r) ((r).x)
86
+ KRADIX_SORT_INIT(cr_intv, cr_intv_t, cr_intv_key, 8)
87
+
88
+ KHASH_MAP_INIT_STR(str, int32_t)
89
+ typedef khash_t(str) strhash_t;
90
+
91
+ cgranges_t *cr_init(void)
92
+ {
93
+ cgranges_t *cr;
94
+ cr = CALLOC(cgranges_t, 1);
95
+ cr->hc = kh_init(str);
96
+ return cr;
97
+ }
98
+
99
+ void cr_destroy(cgranges_t *cr)
100
+ {
101
+ int32_t i;
102
+ if (cr == 0) return;
103
+ for (i = 0; i < cr->n_ctg; ++i)
104
+ free(cr->ctg[i].name);
105
+ free(cr->ctg);
106
+ kh_destroy(str, (strhash_t*)cr->hc);
107
+ free(cr);
108
+ }
109
+
110
+ int32_t cr_add_ctg(cgranges_t *cr, const char *ctg, int32_t len)
111
+ {
112
+ int absent;
113
+ khint_t k;
114
+ strhash_t *h = (strhash_t*)cr->hc;
115
+ k = kh_put(str, h, ctg, &absent);
116
+ if (absent) {
117
+ cr_ctg_t *p;
118
+ if (cr->n_ctg == cr->m_ctg)
119
+ EXPAND(cr->ctg, cr->m_ctg);
120
+ kh_val(h, k) = cr->n_ctg;
121
+ p = &cr->ctg[cr->n_ctg++];
122
+ p->name = strdup(ctg);
123
+ kh_key(h, k) = p->name;
124
+ p->len = len;
125
+ p->n = 0, p->off = -1;
126
+ }
127
+ if (len > cr->ctg[kh_val(h, k)].len)
128
+ cr->ctg[kh_val(h, k)].len = len;
129
+ return kh_val(h, k);
130
+ }
131
+
132
+ int32_t cr_get_ctg(const cgranges_t *cr, const char *ctg)
133
+ {
134
+ khint_t k;
135
+ strhash_t *h = (strhash_t*)cr->hc;
136
+ k = kh_get(str, h, ctg);
137
+ return k == kh_end(h)? -1 : kh_val(h, k);
138
+ }
139
+
140
+ cr_intv_t *cr_add(cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int32_t label_int)
141
+ {
142
+ cr_intv_t *p;
143
+ int32_t k;
144
+ if (st > en) return 0;
145
+ k = cr_add_ctg(cr, ctg, 0);
146
+ if (cr->n_r == cr->m_r)
147
+ EXPAND(cr->r, cr->m_r);
148
+ p = &cr->r[cr->n_r++];
149
+ p->x = (uint64_t)k << 32 | st;
150
+ p->y = en;
151
+ p->label = label_int;
152
+ if (cr->ctg[k].len < en)
153
+ cr->ctg[k].len = en;
154
+ return p;
155
+ }
156
+
157
+ void cr_sort(cgranges_t *cr)
158
+ {
159
+ if (cr->n_ctg == 0 || cr->n_r == 0) return;
160
+ radix_sort_cr_intv(cr->r, cr->r + cr->n_r);
161
+ }
162
+
163
+ int32_t cr_is_sorted(const cgranges_t *cr)
164
+ {
165
+ uint64_t i;
166
+ for (i = 1; i < cr->n_r; ++i)
167
+ if (cr->r[i-1].x > cr->r[i].x)
168
+ break;
169
+ return (i == cr->n_r);
170
+ }
171
+
172
+ /************
173
+ * Indexing *
174
+ ************/
175
+
176
+ void cr_index_prepare(cgranges_t *cr)
177
+ {
178
+ int64_t i, st;
179
+ if (!cr_is_sorted(cr)) cr_sort(cr);
180
+ for (st = 0, i = 1; i <= cr->n_r; ++i) {
181
+ if (i == cr->n_r || cr->r[i].x>>32 != cr->r[st].x>>32) {
182
+ int32_t ctg = cr->r[st].x>>32;
183
+ cr->ctg[ctg].off = st;
184
+ cr->ctg[ctg].n = i - st;
185
+ st = i;
186
+ }
187
+ }
188
+ for (i = 0; i < cr->n_r; ++i) {
189
+ cr_intv_t *r = &cr->r[i];
190
+ r->x = r->x<<32 | r->y;
191
+ r->y = 0;
192
+ }
193
+ }
194
+
195
+ int32_t cr_index1(cr_intv_t *a, int64_t n)
196
+ {
197
+ int64_t i, last_i;
198
+ int32_t last, k;
199
+ if (n <= 0) return -1;
200
+ for (i = 0; i < n; i += 2) last_i = i, last = a[i].y = (int32_t)a[i].x;
201
+ for (k = 1; 1LL<<k <= n; ++k) {
202
+ int64_t x = 1LL<<(k-1), i0 = (x<<1) - 1, step = x<<2;
203
+ for (i = i0; i < n; i += step) {
204
+ int32_t el = a[i - x].y;
205
+ int32_t er = i + x < n? a[i + x].y : last;
206
+ int32_t e = (int32_t)a[i].x;
207
+ e = e > el? e : el;
208
+ e = e > er? e : er;
209
+ a[i].y = e;
210
+ }
211
+ last_i = last_i>>k&1? last_i - x : last_i + x;
212
+ if (last_i < n && a[last_i].y > last)
213
+ last = a[last_i].y;
214
+ }
215
+ return k - 1;
216
+ }
217
+
218
+ void cr_index(cgranges_t *cr)
219
+ {
220
+ int32_t i;
221
+ cr_index_prepare(cr);
222
+ for (i = 0; i < cr->n_ctg; ++i)
223
+ cr->ctg[i].root_k = cr_index1(&cr->r[cr->ctg[i].off], cr->ctg[i].n);
224
+ }
225
+
226
+ /*********
227
+ * Query *
228
+ *********/
229
+
230
+ int64_t cr_min_start_int(const cgranges_t *cr, int32_t ctg_id, int32_t st) // find the smallest i such that cr_st(&r[i]) >= st
231
+ {
232
+ int64_t left, right;
233
+ const cr_ctg_t *c;
234
+ const cr_intv_t *r;
235
+
236
+ if (ctg_id < 0 || ctg_id >= cr->n_ctg) return -1;
237
+ c = &cr->ctg[ctg_id];
238
+ r = &cr->r[c->off];
239
+ if (c->n == 0) return -1;
240
+ left = 0, right = c->n;
241
+ while (right > left) {
242
+ int64_t mid = left + ((right - left) >> 1);
243
+ if (cr_st(&r[mid]) >= st) right = mid;
244
+ else left = mid + 1;
245
+ }
246
+ assert(left == right);
247
+ return left == c->n? -1 : c->off + left;
248
+ }
249
+
250
/* One entry of the explicit traversal stack used by cr_overlap_int(). */
typedef struct {
	int64_t x; /* array index of the node */
	int32_t k; /* level of the node in the implicit tree */
	int32_t w; /* 0 until the node's left child has been handled, then 1 */
} istack_t;
254
+
255
+ int64_t cr_overlap_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
256
+ {
257
+ int32_t t = 0;
258
+ const cr_ctg_t *c;
259
+ const cr_intv_t *r;
260
+ int64_t *b = *b_, m_b = *m_b_, n = 0;
261
+ istack_t stack[64], *p;
262
+
263
+ if (ctg_id < 0 || ctg_id >= cr->n_ctg) return 0;
264
+ c = &cr->ctg[ctg_id];
265
+ r = &cr->r[c->off];
266
+ p = &stack[t++];
267
+ p->k = c->root_k, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
268
+ while (t) { // stack is not empyt
269
+ istack_t z = stack[--t];
270
+ if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
271
+ int64_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
272
+ if (i1 >= c->n) i1 = c->n;
273
+ for (i = i0; i < i1 && cr_st(&r[i]) < en; ++i)
274
+ if (st < cr_en(&r[i])) {
275
+ if (n == m_b) EXPAND(b, m_b);
276
+ b[n++] = c->off + i;
277
+ }
278
+ } else if (z.w == 0) { // if left child not processed
279
+ int64_t y = z.x - (1LL<<(z.k-1));
280
+ p = &stack[t++];
281
+ p->k = z.k, p->x = z.x, p->w = 1;
282
+ if (y >= c->n || r[y].y > st) {
283
+ p = &stack[t++];
284
+ p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
285
+ }
286
+ } else if (z.x < c->n && cr_st(&r[z.x]) < en) {
287
+ if (st < cr_en(&r[z.x])) { // then z.x overlaps the query; write to the output array
288
+ if (n == m_b) EXPAND(b, m_b);
289
+ b[n++] = c->off + z.x;
290
+ }
291
+ p = &stack[t++];
292
+ p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
293
+ }
294
+ }
295
+ *b_ = b, *m_b_ = m_b;
296
+ return n;
297
+ }
298
+
299
+ int64_t cr_contain_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
300
+ {
301
+ int64_t n = 0, i, s, e, *b = *b_, m_b = *m_b_;
302
+ s = cr_min_start_int(cr, ctg_id, st);
303
+ if (s < 0) return 0;
304
+ e = cr->ctg[ctg_id].off + cr->ctg[ctg_id].n;
305
+ for (i = s; i < e; ++i) {
306
+ const cr_intv_t *r = &cr->r[i];
307
+ if (cr_st(r) >= en) break;
308
+ if (cr_st(r) >= st && cr_en(r) <= en) {
309
+ if (n == m_b) EXPAND(b, m_b);
310
+ b[n++] = i;
311
+ }
312
+ }
313
+ *b_ = b, *m_b_ = m_b;
314
+ return n;
315
+ }
316
+
317
+ int64_t cr_min_start(const cgranges_t *cr, const char *ctg, int32_t st)
318
+ {
319
+ return cr_min_start_int(cr, cr_get_ctg(cr, ctg), st);
320
+ }
321
+
322
+ int64_t cr_overlap(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
323
+ {
324
+ return cr_overlap_int(cr, cr_get_ctg(cr, ctg), st, en, b_, m_b_);
325
+ }
326
+
327
+ int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
328
+ {
329
+ return cr_contain_int(cr, cr_get_ctg(cr, ctg), st, en, b_, m_b_);
330
+ }