minimap2 0.2.27.0 → 0.2.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/ext/cmappy/cmappy.c +3 -3
- data/ext/cmappy/cmappy.h +1 -1
- data/ext/minimap2/FAQ.md +1 -1
- data/ext/minimap2/Makefile +4 -3
- data/ext/minimap2/NEWS.md +68 -0
- data/ext/minimap2/README.md +30 -14
- data/ext/minimap2/align.c +136 -52
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +59 -5
- data/ext/minimap2/hit.c +14 -6
- data/ext/minimap2/index.c +304 -13
- data/ext/minimap2/jump.c +201 -0
- data/ext/minimap2/kalloc.h +8 -0
- data/ext/minimap2/ksw2.h +5 -2
- data/ext/minimap2/ksw2_dispatch.c +5 -5
- data/ext/minimap2/ksw2_exts2_sse.c +17 -6
- data/ext/minimap2/lchain.c +5 -5
- data/ext/minimap2/main.c +64 -12
- data/ext/minimap2/map.c +35 -8
- data/ext/minimap2/minimap.h +14 -3
- data/ext/minimap2/minimap2.1 +98 -46
- data/ext/minimap2/misc/README.md +2 -1
- data/ext/minimap2/misc/pafcluster.js +241 -0
- data/ext/minimap2/misc/paftools.js +17 -6
- data/ext/minimap2/mmpriv.h +25 -4
- data/ext/minimap2/options.c +36 -3
- data/ext/minimap2/python/cmappy.h +3 -3
- data/ext/minimap2/python/cmappy.pxd +5 -2
- data/ext/minimap2/python/mappy.pyx +20 -7
- data/ext/minimap2/python/minimap2.py +5 -3
- data/ext/minimap2/seed.c +2 -1
- data/ext/minimap2/setup.py +2 -2
- data/ext/minimap2.patch +2 -2
- data/lib/minimap2/aligner.rb +19 -12
- data/lib/minimap2/alignment.rb +1 -0
- data/lib/minimap2/ffi/constants.rb +10 -2
- data/lib/minimap2/ffi/functions.rb +145 -6
- data/lib/minimap2/ffi/mappy.rb +1 -1
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +8 -7
- data/ext/minimap2/misc/mmphase.js +0 -335
data/ext/minimap2/format.c
CHANGED
@@ -253,6 +253,52 @@ static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *q
|
|
253
253
|
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
|
254
254
|
}
|
255
255
|
|
256
|
+
/*
 * Reverse-complement a two-base splice signal in place.
 * Bases are in the 0..3 nucleotide encoding (complement of b is 3 - b);
 * any value >= 4 is mapped to 4.
 */
static inline void revcomp_splice(uint8_t s[2])
{
	const uint8_t first = s[0], second = s[1];
	s[0] = second < 4 ? (uint8_t)(3 - second) : 4; // new first base = complement of old second
	s[1] = first < 4 ? (uint8_t)(3 - first) : 4;   // new second base = complement of old first
}
|
262
|
+
|
263
|
+
/*
 * Write the splice junctions of alignment `r` into `s`, one line per intron
 * (N operation in the CIGAR), lines separated by '\n' with no trailing newline.
 * Each line is: target name, intron start, intron end, query name,
 * splice-signal score and transcript strand.
 *
 * `s` is always reset to empty first. Nothing is written when the alignment
 * is not spliced, has no base-level alignment (r->p == 0), or has no
 * preferred transcript strand (trans_strand is neither 1 nor 2).
 *
 * The score is the sum of a donor score (GT=3, GC=2, AT=1, else 0) and an
 * acceptor score (AG=3, AC=1, else 0), read on the transcript strand.
 */
void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
{
	int32_t k, ref_pos, n_out = 0;

	s->l = 0;
	if (!r->is_spliced || r->p == 0) return; // no junctions
	if (r->p->trans_strand != 1 && r->p->trans_strand != 2) return; // no preferred strand

	ref_pos = r->rs;
	for (k = 0; k < (int)r->p->n_cigar; ++k) {
		const int op = r->p->cigar[k] & 0xf;
		const int len = r->p->cigar[k] >> 4;
		uint8_t donor[2], acceptor[2], *head, *tail;
		int32_t d_sc, a_sc, is_rev;

		switch (op) {
		case MM_CIGAR_MATCH:
		case MM_CIGAR_EQ_MATCH:
		case MM_CIGAR_X_MISMATCH:
		case MM_CIGAR_DEL:
			ref_pos += len; // these operations consume the reference
			continue;
		case MM_CIGAR_N_SKIP: // intron; handled below
			break;
		default: // other operations do not advance the reference position
			continue;
		}

		assert(len >= 2);
		is_rev = (r->p->trans_strand == 2) ^ r->rev;
		// On the reverse transcript strand, the first two intron bases on the
		// reference hold the acceptor and the last two hold the donor.
		head = is_rev ? acceptor : donor;
		tail = is_rev ? donor : acceptor;
		mm_idx_getseq(mi, r->rid, ref_pos, ref_pos + 2, head);
		mm_idx_getseq(mi, r->rid, ref_pos + len - 2, ref_pos + len, tail);
		if (is_rev) {
			revcomp_splice(donor);
			revcomp_splice(acceptor);
		}

		if (donor[0] == 2) d_sc = donor[1] == 3 ? 3 : donor[1] == 1 ? 2 : 0; // GT or GC
		else d_sc = (donor[0] == 0 && donor[1] == 3) ? 1 : 0;                 // AT
		if (acceptor[0] == 0) a_sc = acceptor[1] == 2 ? 3 : acceptor[1] == 1 ? 1 : 0; // AG or AC
		else a_sc = 0;

		if (n_out) mm_sprintf_lite(s, "\n");
		else n_out = 1;
		mm_sprintf_lite(s, "%s\t%d\t%d\t%s\t%d\t%c", mi->seq[r->rid].name, ref_pos, ref_pos + len, t->name, d_sc + a_sc, "+-"[is_rev]);
		ref_pos += len;
	}
	assert(ref_pos == r->re);
}
|
301
|
+
|
256
302
|
static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag)
|
257
303
|
{
|
258
304
|
int i, q_off, t_off, l_MD = 0;
|
@@ -310,7 +356,7 @@ static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const
|
|
310
356
|
}
|
311
357
|
}
|
312
358
|
}
|
313
|
-
if (is_MD
|
359
|
+
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
|
314
360
|
else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
|
315
361
|
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
|
316
362
|
}
|
@@ -366,15 +412,18 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|
366
412
|
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
|
367
413
|
}
|
368
414
|
|
369
|
-
void
|
415
|
+
void mm_write_paf4(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len, int n_seg, int seg_idx)
|
370
416
|
{
|
371
417
|
s->l = 0;
|
418
|
+
mm_sprintf_lite(s, "%s", t->name);
|
419
|
+
if ((opt_flag & MM_F_FRAG_MODE) && n_seg >= 2 && seg_idx >= 0)
|
420
|
+
mm_sprintf_lite(s, "/%d", seg_idx + 1);
|
372
421
|
if (r == 0) {
|
373
|
-
mm_sprintf_lite(s, "
|
422
|
+
mm_sprintf_lite(s, "\t%d\t0\t0\t*\t*\t0\t0\t0\t0\t0\t0", t->l_seq);
|
374
423
|
if (rep_len >= 0) mm_sprintf_lite(s, "\trl:i:%d", rep_len);
|
375
424
|
return;
|
376
425
|
}
|
377
|
-
mm_sprintf_lite(s, "
|
426
|
+
mm_sprintf_lite(s, "\t%d\t%d\t%d\t%c\t", t->l_seq, r->qs, r->qe, "+-"[r->rev]);
|
378
427
|
if (mi->seq[r->rid].name) mm_sprintf_lite(s, "%s", mi->seq[r->rid].name);
|
379
428
|
else mm_sprintf_lite(s, "%d", r->rid);
|
380
429
|
mm_sprintf_lite(s, "\t%d", mi->seq[r->rid].len);
|
@@ -393,11 +442,16 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
|
|
393
442
|
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
|
394
443
|
}
|
395
444
|
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
|
396
|
-
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
|
445
|
+
write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), !!(opt_flag&MM_F_OUT_MD), !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
|
397
446
|
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
398
447
|
mm_sprintf_lite(s, "\t%s", t->comment);
|
399
448
|
}
|
400
449
|
|
450
|
+
/*
 * PAF writer without segment information: forwards to mm_write_paf4() with
 * n_seg = 0 and seg_idx = 0.
 */
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len)
{
	const int n_seg = 0, seg_idx = 0;
	mm_write_paf4(s, mi, t, r, km, opt_flag, rep_len, n_seg, seg_idx);
}
|
454
|
+
|
401
455
|
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag)
|
402
456
|
{
|
403
457
|
mm_write_paf3(s, mi, t, r, km, opt_flag, -1);
|
data/ext/minimap2/hit.c
CHANGED
@@ -55,7 +55,7 @@ mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u,
|
|
55
55
|
mm_reg1_t *r;
|
56
56
|
int i, k;
|
57
57
|
|
58
|
-
if (n_u
|
58
|
+
if (n_u <= 0) return 0;
|
59
59
|
|
60
60
|
// sort by score
|
61
61
|
z = (mm128_t*)kmalloc(km, n_u * 16);
|
@@ -418,16 +418,19 @@ static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
|
|
418
418
|
kfree(km, aux);
|
419
419
|
}
|
420
420
|
|
421
|
-
void
|
421
|
+
void mm_set_mapq2(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr, int is_splice)
|
422
422
|
{
|
423
423
|
static const float q_coef = 40.0f;
|
424
424
|
int64_t sum_sc = 0;
|
425
425
|
float uniq_ratio;
|
426
|
-
int i;
|
426
|
+
int i, n_2nd_splice = 0;
|
427
427
|
if (n_regs == 0) return;
|
428
|
-
for (i = 0; i < n_regs; ++i)
|
428
|
+
for (i = 0; i < n_regs; ++i) {
|
429
429
|
if (regs[i].parent == regs[i].id)
|
430
430
|
sum_sc += regs[i].score;
|
431
|
+
else if (regs[i].is_spliced)
|
432
|
+
++n_2nd_splice;
|
433
|
+
}
|
431
434
|
uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
|
432
435
|
for (i = 0; i < n_regs; ++i) {
|
433
436
|
mm_reg1_t *r = ®s[i];
|
@@ -440,13 +443,18 @@ void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int ma
|
|
440
443
|
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
|
441
444
|
subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
|
442
445
|
if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
|
443
|
-
float identity = (float)r->mlen / r->blen;
|
444
|
-
|
446
|
+
float x, identity = (float)r->mlen / r->blen;
|
447
|
+
if (is_sr && is_splice)
|
448
|
+
x = (float)r->p->dp_max2 / r->p->dp_max; // ignore chaining score; for short RNA-seq reads, unspliced chaining score tends to be higher
|
449
|
+
else
|
450
|
+
x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
|
445
451
|
mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
|
446
452
|
if (!is_sr) {
|
447
453
|
int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
|
448
454
|
mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
|
449
455
|
}
|
456
|
+
if (is_splice && is_sr && r->is_spliced && n_2nd_splice == 0)
|
457
|
+
mapq += 10;
|
450
458
|
} else {
|
451
459
|
float x = (float)subsc / r->score0;
|
452
460
|
if (r->p) {
|
data/ext/minimap2/index.c
CHANGED
@@ -12,6 +12,7 @@
|
|
12
12
|
#include "bseq.h"
|
13
13
|
#include "minimap.h"
|
14
14
|
#include "mmpriv.h"
|
15
|
+
#include "ksw2.h"
|
15
16
|
#include "kvec.h"
|
16
17
|
#include "khash.h"
|
17
18
|
|
@@ -32,7 +33,7 @@ typedef struct mm_idx_bucket_s {
|
|
32
33
|
} mm_idx_bucket_t;
|
33
34
|
|
34
35
|
typedef struct {
|
35
|
-
int32_t st, en,
|
36
|
+
int32_t st, en, cnt;
|
36
37
|
int32_t score:30, strand:2;
|
37
38
|
} mm_idx_intv1_t;
|
38
39
|
|
@@ -41,6 +42,11 @@ typedef struct mm_idx_intv_s {
|
|
41
42
|
mm_idx_intv1_t *a;
|
42
43
|
} mm_idx_intv_t;
|
43
44
|
|
45
|
+
typedef struct mm_idx_jjump_s {
|
46
|
+
int32_t n, m;
|
47
|
+
mm_idx_jjump1_t *a;
|
48
|
+
} mm_idx_jjump_t;
|
49
|
+
|
44
50
|
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
|
45
51
|
{
|
46
52
|
mm_idx_t *mi;
|
@@ -65,11 +71,17 @@ void mm_idx_destroy(mm_idx_t *mi)
|
|
65
71
|
kh_destroy(idx, (idxhash_t*)mi->B[i].h);
|
66
72
|
}
|
67
73
|
}
|
74
|
+
if (mi->spsc) free(mi->spsc);
|
68
75
|
if (mi->I) {
|
69
76
|
for (i = 0; i < mi->n_seq; ++i)
|
70
77
|
free(mi->I[i].a);
|
71
78
|
free(mi->I);
|
72
79
|
}
|
80
|
+
if (mi->J) {
|
81
|
+
for (i = 0; i < mi->n_seq; ++i)
|
82
|
+
free(mi->J[i].a);
|
83
|
+
free(mi->J);
|
84
|
+
}
|
73
85
|
if (!mi->km) {
|
74
86
|
for (i = 0; i < mi->n_seq; ++i)
|
75
87
|
free(mi->seq[i].name);
|
@@ -99,7 +111,7 @@ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
|
|
99
111
|
|
100
112
|
void mm_idx_stat(const mm_idx_t *mi)
|
101
113
|
{
|
102
|
-
|
114
|
+
int64_t n = 0, n1 = 0;
|
103
115
|
uint32_t i;
|
104
116
|
uint64_t sum = 0, len = 0;
|
105
117
|
fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
|
@@ -117,8 +129,8 @@ void mm_idx_stat(const mm_idx_t *mi)
|
|
117
129
|
if (kh_key(h, k)&1) ++n1;
|
118
130
|
}
|
119
131
|
}
|
120
|
-
fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %
|
121
|
-
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
|
132
|
+
fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %ld (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
|
133
|
+
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), (long)n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
|
122
134
|
}
|
123
135
|
|
124
136
|
int mm_idx_index_name(mm_idx_t *mi)
|
@@ -657,10 +669,17 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
|
|
657
669
|
return n_alt;
|
658
670
|
}
|
659
671
|
|
672
|
+
/***************
|
673
|
+
* BED reading *
|
674
|
+
***************/
|
675
|
+
|
660
676
|
#define sort_key_bed(a) ((a).st)
|
661
677
|
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
|
662
678
|
|
663
|
-
|
679
|
+
#define sort_key_end(a) ((a).en)
|
680
|
+
KRADIX_SORT_INIT(end, mm_idx_intv1_t, sort_key_end, 4)
|
681
|
+
|
682
|
+
static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, int read_junc, int min_sc)
|
664
683
|
{
|
665
684
|
gzFile fp;
|
666
685
|
kstream_t *ks;
|
@@ -669,7 +688,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
|
669
688
|
|
670
689
|
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
671
690
|
if (fp == 0) return 0;
|
672
|
-
I = (mm_idx_intv_t
|
691
|
+
I = CALLOC(mm_idx_intv_t, mi->n_seq);
|
673
692
|
ks = ks_init(fp);
|
674
693
|
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
|
675
694
|
mm_idx_intv_t *r;
|
@@ -690,7 +709,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
|
690
709
|
t.en = atol(q);
|
691
710
|
if (t.en < 0) break;
|
692
711
|
} else if (i == 4) { // BED score
|
693
|
-
t.score = atol(q);
|
712
|
+
t.score = *q >= '0' && *q <= '9'? atol(q) : -1;
|
694
713
|
} else if (i == 5) { // strand
|
695
714
|
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
|
696
715
|
} else if (i == 9) {
|
@@ -706,7 +725,8 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
|
706
725
|
++i, q = p + 1;
|
707
726
|
}
|
708
727
|
}
|
709
|
-
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
|
728
|
+
if (id < 0 || t.st < 0 || t.st >= t.en) continue; // contig ID not found, or other problems
|
729
|
+
if (min_sc > 0 && t.score < min_sc) continue;
|
710
730
|
r = &I[id];
|
711
731
|
if (i >= 11 && read_junc) { // BED12
|
712
732
|
int32_t st, sz, en;
|
@@ -739,14 +759,44 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
|
|
739
759
|
return I;
|
740
760
|
}
|
741
761
|
|
742
|
-
|
762
|
+
static mm_idx_intv_t *mm_idx_bed_read_merge(const mm_idx_t *mi, const char *fn, int read_junc, int min_sc)
|
743
763
|
{
|
764
|
+
long n = 0, n0 = 0;
|
744
765
|
int32_t i;
|
766
|
+
mm_idx_intv_t *I;
|
767
|
+
I = mm_idx_bed_read_core(mi, fn, read_junc, min_sc);
|
768
|
+
if (I == 0) return 0;
|
769
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
770
|
+
int32_t j, j0, k;
|
771
|
+
mm_idx_intv_t *intv = &I[i];
|
772
|
+
n0 += intv->n;
|
773
|
+
radix_sort_bed(intv->a, intv->a + intv->n); // sort by st
|
774
|
+
for (j = 1, j0 = 0; j <= intv->n; ++j) { // sort by st and then by end
|
775
|
+
if (j == intv->n || intv->a[j].st != intv->a[j0].st) {
|
776
|
+
radix_sort_end(intv->a + j0, intv->a + j);
|
777
|
+
j0 = j;
|
778
|
+
}
|
779
|
+
}
|
780
|
+
for (j = 1, j0 = 0, k = 0; j <= intv->n; ++j) { // merge intervals with the same (st, en)
|
781
|
+
if (j == intv->n || intv->a[j].st != intv->a[j0].st || intv->a[j].en != intv->a[j0].en) {
|
782
|
+
intv->a[k] = intv->a[j0];
|
783
|
+
intv->a[k++].cnt = j - j0;
|
784
|
+
j0 = j;
|
785
|
+
}
|
786
|
+
}
|
787
|
+
intv->a = REALLOC(mm_idx_intv1_t, intv->a, k);
|
788
|
+
intv->n = intv->m = k;
|
789
|
+
n += k;
|
790
|
+
}
|
791
|
+
if (mm_verbose >= 3)
|
792
|
+
fprintf(stderr, "[%s] read %ld introns, %ld of which are non-redundant\n", __func__, n0, n);
|
793
|
+
return I;
|
794
|
+
}
|
795
|
+
|
796
|
+
/*
 * Load intervals from BED file `fn` into mi->I, sorted and de-duplicated.
 *
 * @param mi         index to attach the intervals to
 * @param fn         BED file name
 * @param read_junc  forwarded to the BED reader
 * @return 0 on success; -1 if the intervals could not be read (mi->I is then 0)
 *
 * The score filter is disabled here (min_sc = -1).
 */
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
{
	if (mi->h == 0) mm_idx_index_name(mi); // build the name index if it does not exist yet
	mi->I = mm_idx_bed_read_merge(mi, fn, read_junc, -1);
	return mi->I == 0 ? -1 : 0; // report a read failure instead of silently succeeding
}
|
752
802
|
|
@@ -774,3 +824,244 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uin
|
|
774
824
|
}
|
775
825
|
return left;
|
776
826
|
}
|
827
|
+
|
828
|
+
/*********************************
|
829
|
+
* Reading junctions for jumping *
|
830
|
+
*********************************/
|
831
|
+
|
832
|
+
#define sort_key_jj(a) ((a).off)
|
833
|
+
KRADIX_SORT_INIT(jj, mm_idx_jjump1_t, sort_key_jj, 4)
|
834
|
+
|
835
|
+
#define sort_key_jj2(a) ((a).off2)
|
836
|
+
KRADIX_SORT_INIT(jj2, mm_idx_jjump1_t, sort_key_jj2, 4)
|
837
|
+
|
838
|
+
/*
 * Sort jump records by (off, off2) and merge records with the same
 * (off, off2): the survivor is the first record of each group, with `cnt`
 * set to the sum of the group's counts and `flag` to the OR of their flags.
 * The array is shrunk to the merged size. No-op for a null or empty list.
 */
static void sort_jjump(mm_idx_jjump_t *jj2)
{
	int32_t run, stop, kept;

	if (jj2 == 0 || jj2->n == 0) return;

	radix_sort_jj(jj2->a, jj2->a + jj2->n); // primary key: off

	// secondary key: within each run of equal off, sort by off2
	for (run = 0; run < jj2->n; run = stop) {
		stop = run + 1;
		while (stop < jj2->n && jj2->a[stop].off == jj2->a[run].off)
			++stop;
		radix_sort_jj2(jj2->a + run, jj2->a + stop);
	}

	// the actual merge
	kept = 0;
	for (run = 0; run < jj2->n; run = stop) {
		int32_t total = 0;
		uint16_t flags = 0;
		stop = run;
		do {
			total += jj2->a[stop].cnt;
			flags |= jj2->a[stop].flag;
			++stop;
		} while (stop < jj2->n && jj2->a[stop].off == jj2->a[run].off && jj2->a[stop].off2 == jj2->a[run].off2);
		jj2->a[kept] = jj2->a[run];
		jj2->a[kept].cnt = total;
		jj2->a[kept].flag = flags;
		++kept;
	}
	jj2->n = kept;
	jj2->a = REALLOC(mm_idx_jjump1_t, jj2->a, kept);
}
|
864
|
+
|
865
|
+
static mm_idx_jjump_t *mm_idx_bed2jjump(const mm_idx_t *mi, const mm_idx_intv_t *I, uint16_t flag)
|
866
|
+
{
|
867
|
+
int32_t i;
|
868
|
+
mm_idx_jjump_t *J;
|
869
|
+
J = CALLOC(mm_idx_jjump_t, mi->n_seq);
|
870
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
871
|
+
int32_t j, k;
|
872
|
+
const mm_idx_intv_t *intv = &I[i];
|
873
|
+
mm_idx_jjump_t *jj = &J[i];
|
874
|
+
jj->n = intv->n * 2;
|
875
|
+
jj->a = CALLOC(mm_idx_jjump1_t, jj->n);
|
876
|
+
for (j = k = 0; j < intv->n; ++j) {
|
877
|
+
jj->a[k].off = intv->a[j].st, jj->a[k].off2 = intv->a[j].en, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, jj->a[k++].flag = flag;
|
878
|
+
jj->a[k].off = intv->a[j].en, jj->a[k].off2 = intv->a[j].st, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, jj->a[k++].flag = flag;
|
879
|
+
}
|
880
|
+
sort_jjump(jj);
|
881
|
+
}
|
882
|
+
return J;
|
883
|
+
}
|
884
|
+
|
885
|
+
static mm_idx_jjump_t *mm_idx_jjump_merge(const mm_idx_t *mi, const mm_idx_jjump_t *J0, const mm_idx_jjump_t *J1)
|
886
|
+
{
|
887
|
+
int32_t i;
|
888
|
+
mm_idx_jjump_t *J2;
|
889
|
+
J2 = CALLOC(mm_idx_jjump_t, mi->n_seq);
|
890
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
891
|
+
int32_t j, k;
|
892
|
+
const mm_idx_jjump_t *jj0 = &J0[i], *jj1 = &J1[i];
|
893
|
+
mm_idx_jjump_t *jj2 = &J2[i];
|
894
|
+
jj2->n = jj0->n + jj1->n;
|
895
|
+
jj2->a = CALLOC(mm_idx_jjump1_t, jj2->n);
|
896
|
+
for (j = k = 0; j < jj0->n; ++j) jj2->a[k++] = jj0->a[j];
|
897
|
+
for (j = 0; j < jj1->n; ++j) jj2->a[k++] = jj1->a[j];
|
898
|
+
sort_jjump(jj2);
|
899
|
+
}
|
900
|
+
return J2;
|
901
|
+
}
|
902
|
+
|
903
|
+
/*
 * Read junctions from BED file `fn`, convert them to jump records tagged
 * with `flag`, and store them in mi->J. If mi->J already holds records, the
 * new records are merged into them and the old arrays are freed.
 *
 * @param mi      index to update
 * @param fn      BED file name
 * @param flag    flag bits stored on every record read from this file
 * @param min_sc  minimum BED score, forwarded to the BED reader
 * @return 0 on success; -1 if the BED file could not be read (mi->J is left untouched)
 */
int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc)
{
	int32_t i, j, n_anno = 0, n_misc = 0;
	mm_idx_intv_t *I;
	mm_idx_jjump_t *J;

	if (mi->h == 0) mm_idx_index_name(mi);
	I = mm_idx_bed_read_merge(mi, fn, 1, min_sc);
	if (I == 0) return -1; // reader failed; do not dereference a null interval table
	J = mm_idx_bed2jjump(mi, I, flag);
	for (i = 0; i < mi->n_seq; ++i) free(I[i].a);
	free(I);

	if (mi->J) { // merge with the records already present
		mm_idx_jjump_t *J2;
		J2 = mm_idx_jjump_merge(mi, mi->J, J);
		for (i = 0; i < mi->n_seq; ++i) {
			free(mi->J[i].a);
			free(J[i].a);
		}
		free(mi->J);
		free(J);
		mi->J = J2;
	} else {
		mi->J = J;
	}

	// count records by annotation flag for the log message below
	for (i = 0; i < mi->n_seq; ++i) {
		for (j = 0; j < mi->J[i].n; ++j) {
			if (mi->J[i].a[j].flag & MM_JUNC_ANNO) ++n_anno;
			else ++n_misc;
		}
	}
	if (mm_verbose >= 3)
		fprintf(stderr, "[%s] there are %d annotated and %d other splice positions in the index\n", __func__, n_anno, n_misc);
	return 0;
}
|
931
|
+
|
932
|
+
static int32_t mm_idx_jump_get_core(int32_t n, const mm_idx_jjump1_t *a, int32_t x) // similar to mm_idx_find_intv()
|
933
|
+
{
|
934
|
+
int32_t s = 0, e = n;
|
935
|
+
if (n == 0) return -1;
|
936
|
+
if (x < a[0].off) return -1;
|
937
|
+
while (s < e) {
|
938
|
+
int32_t mid = s + (e - s) / 2;
|
939
|
+
if (x >= a[mid].off && (mid + 1 >= n || x < a[mid+1].off)) return mid;
|
940
|
+
else if (x < a[mid].off) e = mid;
|
941
|
+
else s = mid + 1;
|
942
|
+
}
|
943
|
+
assert(0);
|
944
|
+
}
|
945
|
+
|
946
|
+
const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n)
|
947
|
+
{
|
948
|
+
mm_idx_jjump_t *s;
|
949
|
+
int32_t l, r;
|
950
|
+
*n = 0;
|
951
|
+
if (cid >= db->n_seq || cid < 0 || db->J == 0) return 0;
|
952
|
+
if (en < 0 || en > db->seq[cid].len) en = db->seq[cid].len;
|
953
|
+
s = &db->J[cid];
|
954
|
+
if (s->n == 0) return 0;
|
955
|
+
l = mm_idx_jump_get_core(s->n, s->a, st);
|
956
|
+
r = mm_idx_jump_get_core(s->n, s->a, en);
|
957
|
+
*n = r - l;
|
958
|
+
return &s->a[l + 1];
|
959
|
+
}
|
960
|
+
|
961
|
+
/****************
|
962
|
+
* splice score *
|
963
|
+
****************/
|
964
|
+
|
965
|
+
/* Growable array of packed splice scores for one (sequence, strand) pair. */
typedef struct mm_idx_spsc_s {
	uint32_t n, m; // number of entries used in a[] and allocated capacity
	uint64_t *a;   // pos<<8 | (score + KSW_SPSC_OFFSET)<<1 | type (0 = donor, 1 = acceptor)
} mm_idx_spsc_t;
|
969
|
+
|
970
|
+
/*
 * Read per-position splice scores from a tab-delimited file and attach them
 * to the index as idx->spsc (two lists per sequence: index cid*2 for '+',
 * cid*2+1 for '-').
 *
 * Each line needs at least five fields: sequence name, position, strand
 * ('+' or '-'), type ('D' = donor, 'A' = acceptor) and an integer score.
 * Lines with too few fields, an unknown sequence name, an unrecognised
 * strand or type, or a negative position are skipped. Positions at 0 or at
 * or beyond the sequence length are ignored.
 *
 * @param idx     index to update
 * @param fn      file name; NULL or "-" reads from file descriptor 0
 * @param max_sc  scores are clamped to [-max_sc, max_sc]; max_sc itself is capped at 63
 * @return 0 on success, -1 if the file cannot be opened
 */
int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc)
{
	gzFile fp;
	kstring_t str = {0,0,0};
	kstream_t *ks;
	int32_t dret, j;
	int64_t n_read = 0;

	fp = fn && strcmp(fn, "-") != 0? gzopen(fn, "rb") : gzdopen(0, "rb");
	if (fp == 0) return -1;
	if (idx->h == 0) mm_idx_index_name(idx);
	if (max_sc > 63) max_sc = 63;
	idx->spsc = Kcalloc(0, mm_idx_spsc_t, idx->n_seq * 2);
	ks = ks_init(fp);
	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
		mm_idx_spsc_t *s;
		char *p, *q, *name = 0;
		int32_t i, type = -1, strand = 0, cid = -1, score = -1;
		int64_t pos = -1;
		// split the line on tabs in place; q marks the start of the current field
		for (i = 0, p = q = str.s;; ++p) {
			if (*p == '\t' || *p == 0) {
				int c = *p;
				*p = 0;
				if (i == 0) {
					name = q;
				} else if (i == 1) {
					pos = atol(q);
				} else if (i == 2) {
					// compare the field itself against '-'; anything else is an invalid strand
					strand = *q == '+'? 1 : *q == '-'? -1 : 0;
				} else if (i == 3) {
					type = *q == 'D'? 0 : *q == 'A'? 1 : -1;
				} else if (i == 4) {
					score = atoi(q);
					break;
				}
				if (c == 0) break;
				q = p + 1, ++i;
			}
		}
		if (i < 4) continue; // not enough fields
		if (score > max_sc) score = max_sc;
		if (score < -max_sc) score = -max_sc;
		cid = mm_idx_name2id(idx, name);
		if (cid < 0 || type < 0 || strand == 0 || pos < 0) continue; // FIXME: give a warning!
		s = &idx->spsc[cid << 1 | (strand > 0? 0 : 1)];
		Kgrow(0, uint64_t, s->a, s->n, s->m);
		if (pos > 0 && pos < idx->seq[cid].len) { // ignore scores at the ends
			s->a[s->n++] = (uint64_t)pos << 8 | (score + KSW_SPSC_OFFSET) << 1 | type;
			++n_read;
		}
	}
	free(str.s); // line buffer grown by ks_getuntil()
	ks_destroy(ks);
	gzclose(fp);
	// sort every list; position occupies the high bits of each packed entry
	for (j = 0; j < idx->n_seq * 2; ++j) {
		mm_idx_spsc_t *s = &idx->spsc[j];
		if (s->n > 0)
			radix_sort_64(s->a, s->a + s->n);
	}
	if (mm_verbose >= 3)
		fprintf(stderr, "[M::%s] read %ld splice scores\n", __func__, (long)n_read);
	return 0;
}
|
1032
|
+
|
1033
|
+
/*
 * Binary search in `a[0..n)`, an array of packed entries sorted by position,
 * where the position is stored in bits 8 and above of each entry.
 *
 * Returns the largest index i whose position is <= x, or -1 when the array
 * is empty or x is smaller than every position (including any negative x).
 */
static int32_t mm_idx_find_intv(int32_t n, const uint64_t *a, int64_t x)
{
	int32_t s = 0, e = n;
	if (n <= 0) return -1;
	// compare as signed values: a negative x must not be converted to a huge unsigned number
	if (x < (int64_t)(a[0]>>8)) return -1;
	while (s < e) {
		int32_t mid = s + (e - s) / 2;
		int64_t cur = (int64_t)(a[mid]>>8);
		if (x >= cur && (mid + 1 >= n || x < (int64_t)(a[mid+1]>>8))) return mid;
		else if (x < cur) e = mid;
		else s = mid + 1;
	}
	assert(0); // unreachable for a sorted array
	return -1; // keeps the function well-defined when assert() is compiled out
}
|
1046
|
+
|
1047
|
+
int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st, int64_t en, int32_t rev, uint8_t *sc)
|
1048
|
+
{
|
1049
|
+
const mm_idx_spsc_t *s;
|
1050
|
+
if (cid >= db->n_seq || cid < 0 || db->spsc == 0) return -1;
|
1051
|
+
if (en < 0 || en > db->seq[cid].len) en = db->seq[cid].len;
|
1052
|
+
memset(sc, 0xff, en - st);
|
1053
|
+
s = &db->spsc[cid << 1 | (!!rev)];
|
1054
|
+
if (s->n > 0) {
|
1055
|
+
int32_t j, l, r;
|
1056
|
+
l = mm_idx_find_intv(s->n, s->a, st);
|
1057
|
+
r = mm_idx_find_intv(s->n, s->a, en);
|
1058
|
+
for (j = l + 1; j <= r; ++j) {
|
1059
|
+
int64_t x = (s->a[j]>>8) - st;
|
1060
|
+
uint8_t score = s->a[j] & 0xff;
|
1061
|
+
assert(x <= en - st);
|
1062
|
+
if (x == en - st) continue;
|
1063
|
+
if (sc[x] == 0xff || sc[x] < score) sc[x] = score;
|
1064
|
+
}
|
1065
|
+
}
|
1066
|
+
return en - st;
|
1067
|
+
}
|