bio-cgranges 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 618c6866b3f708b148682a529b73efda7b782bfc567f92949f72c237fd98435b
4
+ data.tar.gz: 0ebbc69b7858f934fd98f60500e49b7300774b21e05ee04c654d55746e18f00b
5
+ SHA512:
6
+ metadata.gz: 0444fad48ec7b6266072f2b7fb23684bf295703121c6d0b5b392e3c150de6dda02adbd1ecff161793b5f1a821f8d10cac564c4c60042f5fc4cc191bb26181620
7
+ data.tar.gz: 7f783598ec7ed7937c8163593901f055463175e4b16ab0073cf41bb96341f858077877524c5e1409369822e1efd5eaed80a11acc6c7678d9e27dee4ee3e14dec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 kojix2
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,65 @@
1
+ # bio-cgranges
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/bio-cgranges.svg)](https://badge.fury.io/rb/bio-cgranges)
4
+ [![test](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml/badge.svg)](https://github.com/ruby-on-bioc/bio-cgranges/actions/workflows/ci.yml)
5
+ [![docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/bio-cgranges)
6
+
7
+ Ruby bindings to [lh3/cgranges](https://github.com/lh3/cgranges).
8
+
9
+ > cgranges is a small C library for genomic interval overlap queries
10
+
11
+ ## Installation
12
+
13
+ ```sh
14
+ gem install bio-cgranges
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ```ruby
20
+ require "bio/cgranges"
21
+
22
+ granges = Bio::CGRanges.new
23
+ granges.add("chr1", 10, 20, 0)
24
+ .add("chr1", 15, 25, 1)
25
+ .add("chr1", 30, 40, 2)
26
+ .add("chr1", 10, 25, 3)
27
+ .add("chr1", 15, 20, 4)
28
+ .add("chr2", 10, 20, 5)
29
+ .index
30
+
31
+ granges.overlap("chr1", 12, 22)
32
+ # [["chr1", 10, 20, 0],
33
+ # ["chr1", 10, 25, 3],
34
+ # ["chr1", 15, 25, 1],
35
+ # ["chr1", 15, 20, 4]]
36
+
37
+ granges.contain("chr1", 12, 22)
38
+ # [["chr1", 15, 20, 4]]
39
+ ```
40
+
41
+ ```
42
+ 0.........5.........10........15........20........25........30........35........40
43
+ |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
44
+ 0-0-0-0-0-0-0-0-0-0
45
+ 1-1-1-1-1-1-1-1-1-1
46
+ 2-2-2-2-2-2-2-2-2-2
47
+ 3-3-3-3-3-3-3-3-3-3-3-3-3-3-3
48
+ 4-4-4-4-4
49
+ 5-5-5-5-5
50
+ Q-Q-Q-Q-Q-Q-Q-Q-Q-Q
51
+ |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
52
+ 0.........5.........10........15........20........25........30........35........40
53
+ ```
54
+
55
+ ## Development
56
+
57
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ruby-on-bioc/bio-cgranges.
58
+
59
+ Do you need commit rights to my repository?
60
+ Do you want to get admin rights and take over the project?
61
+ If so, please feel free to contact us @kojix2.
62
+
63
+ ## License
64
+
65
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,44 @@
1
# frozen_string_literal: true

# Rake tasks for the gem: run the tests, lint with RuboCop and compile the
# "cgranges" C extension.

require "bundler/gem_tasks"
require "rake/testtask"
require "rubocop/rake_task"
require "rake/extensiontask"

# `rake test` runs every test/**/*_test.rb with test/ and lib/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push("test", "lib")
  t.test_files = FileList["test/**/*_test.rb"]
end

RuboCop::RakeTask.new

# Building the gem requires the extension to be compiled first.
task build: :compile

# Sources are read from ext/bio/cgranges; the built library goes to lib/bio/cgranges.
Rake::ExtensionTask.new("cgranges") do |ext|
  ext.lib_dir = "lib/bio/cgranges"
  ext.ext_dir = "ext/bio/cgranges"
end

desc "Remove object file"
task :remove_object_file do
  Dir["ext/**/*.{o,bundle}"].each { |f| FileUtils.rm(f) }
end

task default: %i[clobber compile remove_object_file test rubocop]

task cleanall: %i[clobber remove_object_file clean]
@@ -0,0 +1,23 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2019 Dana-Farber Cancer Institute
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
@@ -0,0 +1,133 @@
1
+ ## Introduction
2
+
3
+ cgranges is a small C library for genomic interval overlap queries: given a
4
+ genomic region *r* and a set of regions *R*, finding all regions in *R* that
5
+ overlap *r*. Although this library is based on an [interval tree][itree], a well
6
+ known data structure, the core algorithm of cgranges is distinct from all
7
+ existing implementations to the best of our knowledge. Specifically, the
8
+ interval tree in cgranges is implicitly encoded as a plain sorted array
9
+ (similar to [binary heap][bheap] but packed differently). Tree
10
+ traversal is achieved by jumping between array indices. This treatment makes
11
+ cgranges very efficient and compact in memory. The core algorithm can be
12
+ implemented in ~50 lines of C++ code, much shorter than others as well. Please
13
+ see the code comments in [cpp/IITree.h](cpp/IITree.h) for details.
14
+
15
+ ## Usage
16
+
17
+ ### Test with BED coverage
18
+
19
+ For testing purposes, this repo implements the [bedtools coverage][bedcov] tool
20
+ with cgranges. The source code is located in the [test/](test) directory. You
21
+ can compile and run the test with:
22
+ ```sh
23
+ cd test && make
24
+ ./bedcov-cr test1.bed test2.bed
25
+ ```
26
+ The first BED file is loaded into RAM and indexed. The depth and the breadth of
27
+ coverage of each region in the second file is computed by query against the
28
+ index of the first file.
29
+
30
+ The [test/](test) directory also contains a few other implementations based on
31
+ [IntervalTree.h][ekg-itree] in C++, [quicksect][quicksect] in Cython and
32
+ [ncls][ncls] in Cython. The table below shows timing and peak memory on two
33
+ test BEDs available in the release page. The first BED contains GenCode
34
+ annotations with ~1.2 million lines, mixing all types of features. The second
35
+ contains ~10 million direct-RNA mappings. Time1a/Mem1a indexes the GenCode BED
36
+ into memory. Time1b adds whole chromosome intervals to the GenCode BED when
37
+ indexing. Time2/Mem2 indexes the RNA-mapping BED into memory. Numbers are
38
+ averaged over 5 runs.
39
+
40
+ |Algo. |Lang. |Cov|Program |Time1a|Time1b|Mem1a |Time2 |Mem2 |
41
+ |:-------|:-----|:-:|:---------------|-----:|-----:|-------:|-----:|-------:|
42
+ |IAITree |C |Y |cgranges |9.0s |13.9s |19.1MB |4.6s |138.4MB |
43
+ |IAITree |C++ |Y |cpp/iitree.h |11.1s |24.5s |22.4MB |5.8s |160.4MB |
44
+ |CITree |C++ |Y |IntervalTree.h |17.4s |17.4s |27.2MB |10.5s |179.5MB |
45
+ |IAITree |C |N |cgranges |7.6s |13.0s |19.1MB |4.1s |138.4MB |
46
+ |AIList |C |N |3rd-party/AIList|7.9s |8.1s |14.4MB |6.5s |104.8MB |
47
+ |NCList |C |N |3rd-party/NCList|13.0s |13.4s |21.4MB |10.6s |183.0MB |
48
+ |AITree |C |N |3rd-party/AITree|16.8s |18.4s |73.4MB |27.3s |546.4MB |
49
+ |IAITree |Cython|N |cgranges |56.6s |63.9s |23.4MB |43.9s |143.1MB |
50
+ |binning |C++ |Y |bedtools |201.9s|280.4s|478.5MB |149.1s|3438.1MB|
51
+
52
+ Here, IAITree = implicit augmented interval tree, used by cgranges;
53
+ CITree = centered interval tree, used by [Erik Garrison's
54
+ IntervalTree][ekg-itree]; AIList = augmented interval list, by [Feng et
55
+ al][ailist]; NCList = nested containment list, taken from [ncls][ncls] by Feng
56
+ et al; AITree = augmented interval tree, from [kerneltree][kerneltree].
57
+ "Cov" indicates whether the program calculates breadth of coverage.
58
+ Comments:
59
+
60
+ * AIList keeps start and end only. IAITree and CITree additionally store a
61
+ 4-byte "ID" field per interval to reference the source of interval. This is
62
+ partly why AIList uses the least memory.
63
+
64
+ * IAITree is more sensitive to the worst case: the presence of an interval
65
+ spanning the whole chromosome.
66
+
67
+ * IAITree uses an efficient radix sort. CITree uses std::sort from STL, which
68
+ is ok. AIList and NCList use qsort from libc, which is slow. Faster sorting
69
+ leads to faster indexing.
70
+
71
+ * IAITree in C++ uses identical core algorithm to the C version, but limited by
72
+ its APIs, it wastes time on memory locality and management. CITree has a
73
+ similar issue.
74
+
75
+ * Computing coverage is better done when the returned list of intervals is
76
+ start sorted. IAITree returns sorted list. CITree doesn't. Not sure about
77
+ others. Computing coverage takes a couple of seconds. Sorting will be slower.
78
+
79
+ * Printing intervals also takes a noticeable fraction of time. Custom printf
80
+ equivalent would be faster.
81
+
82
+ * IAITree+Cython is a wrapper around the C version of cgranges. Cython adds
83
+ significant overhead.
84
+
85
+ * Bedtools is designed for a variety of applications in addition to computing
86
+ coverage. It may keep other information in its internal data structure. This
87
+ micro-benchmark may be unfair to bedtools.
88
+
89
+ * In general, the performance is affected a lot by subtle implementation
90
+ details. CITree, IAITree, NCList and AIList are all broadly comparable in
91
+ performance. AITree is not recommended when indexed intervals are immutable.
92
+
93
+ ### Use cgranges as a C library
94
+
95
+ ```c
96
+ cgranges_t *cr = cr_init(); // initialize a cgranges_t object
97
+ cr_add(cr, "chr1", 20, 30, 0); // add a genomic interval
98
+ cr_add(cr, "chr2", 10, 30, 1);
99
+ cr_add(cr, "chr1", 10, 25, 2);
100
+ cr_index(cr); // index
101
+
102
+ int64_t i, n, *b = 0, max_b = 0;
103
+ n = cr_overlap(cr, "chr1", 15, 22, &b, &max_b); // overlap query; output array b[] can be reused
104
+ for (i = 0; i < n; ++i) // traverse overlapping intervals
105
+ printf("%d\t%d\t%d\n", cr_start(cr, b[i]), cr_end(cr, b[i]), cr_label(cr, b[i]));
106
+ free(b); // b[] is allocated by malloc() inside cr_overlap(), so needs to be freed with free()
107
+
108
+ cr_destroy(cr);
109
+ ```
110
+
111
+ ### Use IITree as a C++ library
112
+
113
+ ```cpp
114
+ IITree<int, int> tree;
115
+ tree.add(12, 34, 0); // add an interval
116
+ tree.add(0, 23, 1);
117
+ tree.add(34, 56, 2);
118
+ tree.index(); // index
119
+ std::vector<size_t> a;
120
+ tree.overlap(22, 25, a); // retrieve overlaps
121
+ for (size_t i = 0; i < a.size(); ++i)
122
+ printf("%d\t%d\t%d\n", tree.start(a[i]), tree.end(a[i]), tree.data(a[i]));
123
+ ```
124
+
125
+ [bedcov]: https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html
126
+ [ekg-itree]: https://github.com/ekg/intervaltree
127
+ [quicksect]: https://github.com/brentp/quicksect
128
+ [ncls]: https://github.com/hunt-genes/ncls
129
+ [citree]: https://en.wikipedia.org/wiki/Interval_tree#Centered_interval_tree
130
+ [itree]: https://en.wikipedia.org/wiki/Interval_tree
131
+ [bheap]: https://en.wikipedia.org/wiki/Binary_heap
132
+ [ailist]: https://www.biorxiv.org/content/10.1101/593657v1
133
+ [kerneltree]: https://github.com/biocore-ntnu/kerneltree
@@ -0,0 +1,330 @@
1
+ #include <stdio.h>
2
+ #include <assert.h>
3
+ #include "cgranges.h"
4
+ #include "khash.h"
5
+
6
+ /**************
7
+ * Radix sort *
8
+ **************/
9
+
10
#define RS_MIN_SIZE 64 /* ranges of at most this many elements are insertion-sorted */
#define RS_MAX_BITS 8  /* bits of the key consumed per radix pass (1<<8 buckets) */

/*
 * KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key)
 *
 * Generates an in-place, ascending, most-significant-digit-first radix sort
 * for arrays of `rstype_t`:
 *
 *   rs_insertsort_<name>(beg, end)        insertion sort of [beg, end)
 *   rs_sort_<name>(beg, end, n_bits, s)   one radix pass on key bits
 *                                         [s, s + n_bits), then recursion on
 *                                         each bucket with the next lower bits
 *   radix_sort_<name>(beg, end)           entry point
 *
 * `rskey(elem)` must yield the integer sort key of an element and
 * `sizeof_key` is the key width in bytes; the first pass looks at the top
 * byte, i.e. shift (sizeof_key - 1) * RS_MAX_BITS.
 */
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
    typedef struct { \
        rstype_t *b, *e; /* bucket occupies [b, e) */ \
    } rsbucket_##name##_t; \
    void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
    { \
        rstype_t *it; \
        for (it = beg + 1; it < end; ++it) { \
            rstype_t *slot, held; \
            if (!(rskey(*it) < rskey(*(it - 1)))) continue; /* already in place */ \
            held = *it; \
            slot = it; \
            while (slot > beg && rskey(held) < rskey(*(slot - 1))) { \
                *slot = *(slot - 1); \
                --slot; \
            } \
            *slot = held; \
        } \
    } \
    void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
    { \
        rstype_t *it; \
        int n_bkt = 1 << n_bits, mask = n_bkt - 1; \
        rsbucket_##name##_t *cur, bkt[1 << RS_MAX_BITS], *bkt_end = bkt + n_bkt; \
        assert(n_bits <= RS_MAX_BITS); \
        /* count elements per digit: e - beg is the count for now */ \
        for (cur = bkt; cur != bkt_end; ++cur) cur->b = cur->e = beg; \
        for (it = beg; it != end; ++it) ++bkt[(rskey(*it) >> s) & mask].e; \
        /* prefix sums turn the counts into [b, e) ranges */ \
        for (cur = bkt + 1; cur != bkt_end; ++cur) { \
            cur->e += (cur - 1)->e - beg; \
            cur->b = (cur - 1)->e; \
        } \
        /* permute in place; b advances past elements already in their bucket */ \
        cur = bkt; \
        while (cur != bkt_end) { \
            rsbucket_##name##_t *dst; \
            if (cur->b == cur->e) { ++cur; continue; } \
            dst = bkt + ((rskey(*cur->b) >> s) & mask); \
            if (dst == cur) { ++cur->b; continue; } \
            { \
                rstype_t carry = *cur->b, hold; \
                do { \
                    hold = carry; carry = *dst->b; *dst->b++ = hold; \
                    dst = bkt + ((rskey(carry) >> s) & mask); \
                } while (dst != cur); \
                *cur->b++ = carry; \
            } \
        } \
        /* restore the bucket start pointers consumed above */ \
        bkt->b = beg; \
        for (cur = bkt + 1; cur != bkt_end; ++cur) cur->b = (cur - 1)->e; \
        if (s) { \
            s = s > n_bits ? s - n_bits : 0; \
            for (cur = bkt; cur != bkt_end; ++cur) { \
                if (cur->e - cur->b > RS_MIN_SIZE) rs_sort_##name(cur->b, cur->e, n_bits, s); \
                else if (cur->e - cur->b > 1) rs_insertsort_##name(cur->b, cur->e); \
            } \
        } \
    } \
    void radix_sort_##name(rstype_t *beg, rstype_t *end) \
    { \
        if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
        else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
    }
64
+
65
+ /*********************
66
+ * Convenient macros *
67
+ *********************/
68
+
69
#ifndef kroundup32
/* Replace the 32-bit unsigned lvalue x, in place, by the smallest power of two >= x. */
#define kroundup32(x) (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, (x) |= (x) >> 16, ++(x))
#endif

/* Allocate a zero-filled array of `len` elements of `type`. */
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
/* Resize `ptr` (an lvalue) to hold `len` elements; the result is stored back into `ptr`. */
#define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr))))

/*
 * Grow array `a` whose capacity is tracked in `m`: an empty array gets
 * 16 slots, otherwise the capacity grows by half. Both arguments must be
 * lvalues and are evaluated more than once.
 */
#define EXPAND(a, m) do { \
        if (m) (m) += (m) >> 1; \
        else (m) = 16; \
        REALLOC((a), (m)); \
    } while (0)
80
+
81
+ /********************
82
+ * Basic operations *
83
+ ********************/
84
+
85
+ #define cr_intv_key(r) ((r).x)
86
+ KRADIX_SORT_INIT(cr_intv, cr_intv_t, cr_intv_key, 8)
87
+
88
+ KHASH_MAP_INIT_STR(str, int32_t)
89
+ typedef khash_t(str) strhash_t;
90
+
91
+ cgranges_t *cr_init(void)
92
+ {
93
+ cgranges_t *cr;
94
+ cr = CALLOC(cgranges_t, 1);
95
+ cr->hc = kh_init(str);
96
+ return cr;
97
+ }
98
+
99
+ void cr_destroy(cgranges_t *cr)
100
+ {
101
+ int32_t i;
102
+ if (cr == 0) return;
103
+ for (i = 0; i < cr->n_ctg; ++i)
104
+ free(cr->ctg[i].name);
105
+ free(cr->ctg);
106
+ kh_destroy(str, (strhash_t*)cr->hc);
107
+ free(cr);
108
+ }
109
+
110
+ int32_t cr_add_ctg(cgranges_t *cr, const char *ctg, int32_t len)
111
+ {
112
+ int absent;
113
+ khint_t k;
114
+ strhash_t *h = (strhash_t*)cr->hc;
115
+ k = kh_put(str, h, ctg, &absent);
116
+ if (absent) {
117
+ cr_ctg_t *p;
118
+ if (cr->n_ctg == cr->m_ctg)
119
+ EXPAND(cr->ctg, cr->m_ctg);
120
+ kh_val(h, k) = cr->n_ctg;
121
+ p = &cr->ctg[cr->n_ctg++];
122
+ p->name = strdup(ctg);
123
+ kh_key(h, k) = p->name;
124
+ p->len = len;
125
+ p->n = 0, p->off = -1;
126
+ }
127
+ if (len > cr->ctg[kh_val(h, k)].len)
128
+ cr->ctg[kh_val(h, k)].len = len;
129
+ return kh_val(h, k);
130
+ }
131
+
132
+ int32_t cr_get_ctg(const cgranges_t *cr, const char *ctg)
133
+ {
134
+ khint_t k;
135
+ strhash_t *h = (strhash_t*)cr->hc;
136
+ k = kh_get(str, h, ctg);
137
+ return k == kh_end(h)? -1 : kh_val(h, k);
138
+ }
139
+
140
+ cr_intv_t *cr_add(cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int32_t label_int)
141
+ {
142
+ cr_intv_t *p;
143
+ int32_t k;
144
+ if (st > en) return 0;
145
+ k = cr_add_ctg(cr, ctg, 0);
146
+ if (cr->n_r == cr->m_r)
147
+ EXPAND(cr->r, cr->m_r);
148
+ p = &cr->r[cr->n_r++];
149
+ p->x = (uint64_t)k << 32 | st;
150
+ p->y = en;
151
+ p->label = label_int;
152
+ if (cr->ctg[k].len < en)
153
+ cr->ctg[k].len = en;
154
+ return p;
155
+ }
156
+
157
+ void cr_sort(cgranges_t *cr)
158
+ {
159
+ if (cr->n_ctg == 0 || cr->n_r == 0) return;
160
+ radix_sort_cr_intv(cr->r, cr->r + cr->n_r);
161
+ }
162
+
163
+ int32_t cr_is_sorted(const cgranges_t *cr)
164
+ {
165
+ uint64_t i;
166
+ for (i = 1; i < cr->n_r; ++i)
167
+ if (cr->r[i-1].x > cr->r[i].x)
168
+ break;
169
+ return (i == cr->n_r);
170
+ }
171
+
172
+ /************
173
+ * Indexing *
174
+ ************/
175
+
176
+ void cr_index_prepare(cgranges_t *cr)
177
+ {
178
+ int64_t i, st;
179
+ if (!cr_is_sorted(cr)) cr_sort(cr);
180
+ for (st = 0, i = 1; i <= cr->n_r; ++i) {
181
+ if (i == cr->n_r || cr->r[i].x>>32 != cr->r[st].x>>32) {
182
+ int32_t ctg = cr->r[st].x>>32;
183
+ cr->ctg[ctg].off = st;
184
+ cr->ctg[ctg].n = i - st;
185
+ st = i;
186
+ }
187
+ }
188
+ for (i = 0; i < cr->n_r; ++i) {
189
+ cr_intv_t *r = &cr->r[i];
190
+ r->x = r->x<<32 | r->y;
191
+ r->y = 0;
192
+ }
193
+ }
194
+
195
+ int32_t cr_index1(cr_intv_t *a, int64_t n)
196
+ {
197
+ int64_t i, last_i;
198
+ int32_t last, k;
199
+ if (n <= 0) return -1;
200
+ for (i = 0; i < n; i += 2) last_i = i, last = a[i].y = (int32_t)a[i].x;
201
+ for (k = 1; 1LL<<k <= n; ++k) {
202
+ int64_t x = 1LL<<(k-1), i0 = (x<<1) - 1, step = x<<2;
203
+ for (i = i0; i < n; i += step) {
204
+ int32_t el = a[i - x].y;
205
+ int32_t er = i + x < n? a[i + x].y : last;
206
+ int32_t e = (int32_t)a[i].x;
207
+ e = e > el? e : el;
208
+ e = e > er? e : er;
209
+ a[i].y = e;
210
+ }
211
+ last_i = last_i>>k&1? last_i - x : last_i + x;
212
+ if (last_i < n && a[last_i].y > last)
213
+ last = a[last_i].y;
214
+ }
215
+ return k - 1;
216
+ }
217
+
218
+ void cr_index(cgranges_t *cr)
219
+ {
220
+ int32_t i;
221
+ cr_index_prepare(cr);
222
+ for (i = 0; i < cr->n_ctg; ++i)
223
+ cr->ctg[i].root_k = cr_index1(&cr->r[cr->ctg[i].off], cr->ctg[i].n);
224
+ }
225
+
226
+ /*********
227
+ * Query *
228
+ *********/
229
+
230
+ int64_t cr_min_start_int(const cgranges_t *cr, int32_t ctg_id, int32_t st) // find the smallest i such that cr_st(&r[i]) >= st
231
+ {
232
+ int64_t left, right;
233
+ const cr_ctg_t *c;
234
+ const cr_intv_t *r;
235
+
236
+ if (ctg_id < 0 || ctg_id >= cr->n_ctg) return -1;
237
+ c = &cr->ctg[ctg_id];
238
+ r = &cr->r[c->off];
239
+ if (c->n == 0) return -1;
240
+ left = 0, right = c->n;
241
+ while (right > left) {
242
+ int64_t mid = left + ((right - left) >> 1);
243
+ if (cr_st(&r[mid]) >= st) right = mid;
244
+ else left = mid + 1;
245
+ }
246
+ assert(left == right);
247
+ return left == c->n? -1 : c->off + left;
248
+ }
249
+
250
/* One entry of the explicit traversal stack used by cr_overlap_int(). */
typedef struct {
    int64_t x; /* position of the node within the contig's records */
    int32_t k; /* level of the node */
    int32_t w; /* 0: left child not processed yet; 1: left child done */
} istack_t;
254
+
255
+ int64_t cr_overlap_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
256
+ {
257
+ int32_t t = 0;
258
+ const cr_ctg_t *c;
259
+ const cr_intv_t *r;
260
+ int64_t *b = *b_, m_b = *m_b_, n = 0;
261
+ istack_t stack[64], *p;
262
+
263
+ if (ctg_id < 0 || ctg_id >= cr->n_ctg) return 0;
264
+ c = &cr->ctg[ctg_id];
265
+ r = &cr->r[c->off];
266
+ p = &stack[t++];
267
+ p->k = c->root_k, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
268
+ while (t) { // stack is not empyt
269
+ istack_t z = stack[--t];
270
+ if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
271
+ int64_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
272
+ if (i1 >= c->n) i1 = c->n;
273
+ for (i = i0; i < i1 && cr_st(&r[i]) < en; ++i)
274
+ if (st < cr_en(&r[i])) {
275
+ if (n == m_b) EXPAND(b, m_b);
276
+ b[n++] = c->off + i;
277
+ }
278
+ } else if (z.w == 0) { // if left child not processed
279
+ int64_t y = z.x - (1LL<<(z.k-1));
280
+ p = &stack[t++];
281
+ p->k = z.k, p->x = z.x, p->w = 1;
282
+ if (y >= c->n || r[y].y > st) {
283
+ p = &stack[t++];
284
+ p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
285
+ }
286
+ } else if (z.x < c->n && cr_st(&r[z.x]) < en) {
287
+ if (st < cr_en(&r[z.x])) { // then z.x overlaps the query; write to the output array
288
+ if (n == m_b) EXPAND(b, m_b);
289
+ b[n++] = c->off + z.x;
290
+ }
291
+ p = &stack[t++];
292
+ p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
293
+ }
294
+ }
295
+ *b_ = b, *m_b_ = m_b;
296
+ return n;
297
+ }
298
+
299
+ int64_t cr_contain_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
300
+ {
301
+ int64_t n = 0, i, s, e, *b = *b_, m_b = *m_b_;
302
+ s = cr_min_start_int(cr, ctg_id, st);
303
+ if (s < 0) return 0;
304
+ e = cr->ctg[ctg_id].off + cr->ctg[ctg_id].n;
305
+ for (i = s; i < e; ++i) {
306
+ const cr_intv_t *r = &cr->r[i];
307
+ if (cr_st(r) >= en) break;
308
+ if (cr_st(r) >= st && cr_en(r) <= en) {
309
+ if (n == m_b) EXPAND(b, m_b);
310
+ b[n++] = i;
311
+ }
312
+ }
313
+ *b_ = b, *m_b_ = m_b;
314
+ return n;
315
+ }
316
+
317
+ int64_t cr_min_start(const cgranges_t *cr, const char *ctg, int32_t st)
318
+ {
319
+ return cr_min_start_int(cr, cr_get_ctg(cr, ctg), st);
320
+ }
321
+
322
+ int64_t cr_overlap(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
323
+ {
324
+ return cr_overlap_int(cr, cr_get_ctg(cr, ctg), st, en, b_, m_b_);
325
+ }
326
+
327
+ int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
328
+ {
329
+ return cr_contain_int(cr, cr_get_ctg(cr, ctg), st, en, b_, m_b_);
330
+ }