minimap2 0.2.28.0 → 0.2.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/ext/cmappy/cmappy.c +3 -3
- data/ext/cmappy/cmappy.h +1 -1
- data/ext/minimap2/FAQ.md +1 -1
- data/ext/minimap2/Makefile +4 -3
- data/ext/minimap2/NEWS.md +39 -0
- data/ext/minimap2/README.md +30 -14
- data/ext/minimap2/align.c +134 -50
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +57 -3
- data/ext/minimap2/hit.c +14 -6
- data/ext/minimap2/index.c +304 -13
- data/ext/minimap2/jump.c +201 -0
- data/ext/minimap2/kalloc.h +8 -0
- data/ext/minimap2/ksw2.h +5 -2
- data/ext/minimap2/ksw2_dispatch.c +5 -5
- data/ext/minimap2/ksw2_exts2_sse.c +17 -6
- data/ext/minimap2/main.c +60 -12
- data/ext/minimap2/map.c +35 -8
- data/ext/minimap2/minimap.h +14 -3
- data/ext/minimap2/minimap2.1 +92 -45
- data/ext/minimap2/misc/README.md +2 -1
- data/ext/minimap2/misc/pafcluster.js +241 -0
- data/ext/minimap2/misc/paftools.js +8 -3
- data/ext/minimap2/mmpriv.h +24 -2
- data/ext/minimap2/options.c +27 -2
- data/ext/minimap2/python/cmappy.h +3 -3
- data/ext/minimap2/python/cmappy.pxd +4 -2
- data/ext/minimap2/python/mappy.pyx +19 -7
- data/ext/minimap2/setup.py +2 -2
- data/ext/minimap2.patch +2 -2
- data/lib/minimap2/aligner.rb +19 -12
- data/lib/minimap2/ffi/constants.rb +9 -1
- data/lib/minimap2/ffi/functions.rb +145 -6
- data/lib/minimap2/ffi/mappy.rb +1 -1
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +5 -4
- data/ext/minimap2/misc/mmphase.js +0 -335
data/ext/minimap2/minimap.h
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#include <stdio.h>
|
6
6
|
#include <sys/types.h>
|
7
7
|
|
8
|
-
#define MM_VERSION "2.
|
8
|
+
#define MM_VERSION "2.29-r1283"
|
9
9
|
|
10
10
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
11
11
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
@@ -45,6 +45,9 @@
|
|
45
45
|
#define MM_F_SPLICE_OLD (0x800000000LL)
|
46
46
|
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
|
47
47
|
#define MM_F_OUT_DS (0x2000000000LL)
|
48
|
+
#define MM_F_WEAK_PAIRING (0x4000000000LL)
|
49
|
+
#define MM_F_SR_RNA (0x8000000000LL)
|
50
|
+
#define MM_F_OUT_JUNC (0x10000000000LL)
|
48
51
|
|
49
52
|
#define MM_I_HPC 0x1
|
50
53
|
#define MM_I_NO_SEQ 0x2
|
@@ -91,6 +94,8 @@ typedef struct {
|
|
91
94
|
uint32_t *S; // 4-bit packed sequence
|
92
95
|
struct mm_idx_bucket_s *B; // index (hidden)
|
93
96
|
struct mm_idx_intv_s *I; // intervals (hidden)
|
97
|
+
struct mm_idx_spsc_s *spsc;// splice score (hidden)
|
98
|
+
struct mm_idx_jjump_s *J; // junctions to create jumps (hidden)
|
94
99
|
void *km, *h;
|
95
100
|
} mm_idx_t;
|
96
101
|
|
@@ -115,7 +120,7 @@ typedef struct {
|
|
115
120
|
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
|
116
121
|
int32_t n_sub; // number of suboptimal mappings
|
117
122
|
int32_t score0; // initial chaining score (before chain merging/splitting)
|
118
|
-
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:
|
123
|
+
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, is_spliced:1, dummy:4;
|
119
124
|
uint32_t hash;
|
120
125
|
float div;
|
121
126
|
mm_extra_t *p;
|
@@ -158,7 +163,7 @@ typedef struct {
|
|
158
163
|
int transition; // transition mismatch score (A:G, C:T)
|
159
164
|
int sc_ambi; // score when one or both bases are "N"
|
160
165
|
int noncan; // cost of non-canonical splicing sites
|
161
|
-
int junc_bonus;
|
166
|
+
int junc_bonus, junc_pen;
|
162
167
|
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
|
163
168
|
int end_bonus;
|
164
169
|
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
|
@@ -171,6 +176,8 @@ typedef struct {
|
|
171
176
|
|
172
177
|
int pe_ori, pe_bonus;
|
173
178
|
|
179
|
+
int32_t jump_min_match;
|
180
|
+
|
174
181
|
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
175
182
|
float q_occ_frac;
|
176
183
|
int32_t min_mid_occ, max_mid_occ;
|
@@ -411,6 +418,10 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
|
|
411
418
|
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
|
412
419
|
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
|
413
420
|
|
421
|
+
int mm_max_spsc_bonus(const mm_mapopt_t *mo);
|
422
|
+
int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc);
|
423
|
+
int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st0, int64_t en0, int32_t rev, uint8_t *sc);
|
424
|
+
|
414
425
|
// deprecated APIs for backward compatibility
|
415
426
|
void mm_mapopt_init(mm_mapopt_t *opt);
|
416
427
|
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
data/ext/minimap2/minimap2.1
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
.TH minimap2 1 "
|
1
|
+
.TH minimap2 1 "18 April 2025" "minimap2-2.29 (r1283)" "Bioinformatics tools"
|
2
2
|
.SH NAME
|
3
3
|
.PP
|
4
4
|
minimap2 - mapping and alignment between collections of DNA sequences
|
@@ -79,19 +79,6 @@ Minimizer k-mer length [15]
|
|
79
79
|
.BI -w \ INT
|
80
80
|
Minimizer window size [10]. A minimizer is the smallest k-mer
|
81
81
|
in a window of w consecutive k-mers.
|
82
|
-
.TP
|
83
|
-
.BI -j \ INT
|
84
|
-
Syncmer submer size [10]. Option
|
85
|
-
.B -j
|
86
|
-
and
|
87
|
-
.B -w
|
88
|
-
will override each: if
|
89
|
-
.B -w
|
90
|
-
is applied after
|
91
|
-
.BR -j ,
|
92
|
-
.B -j
|
93
|
-
will have no effect, and vice versa.
|
94
|
-
|
95
82
|
.TP
|
96
83
|
.B -H
|
97
84
|
Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
|
@@ -310,11 +297,13 @@ maximum alignment gap is mostly controlled by
|
|
310
297
|
.B --splice
|
311
298
|
Enable the splice alignment mode.
|
312
299
|
.TP
|
313
|
-
.
|
314
|
-
Enable short-read alignment heuristics.
|
315
|
-
|
316
|
-
|
317
|
-
|
300
|
+
.BR --sr [= no | dna | rna ]
|
301
|
+
Enable short-read alignment heuristics [no]. If this option is used with no argument,
|
302
|
+
.RB ` dna '
|
303
|
+
is set. In the DNA short-read mode, minimap2 applies a second round of chaining
|
304
|
+
with a higher minimizer occurrence threshold if no good chain is found. In
|
305
|
+
addition, minimap2 attempts to patch gaps between seeds with ungapped
|
306
|
+
alignment.
|
318
307
|
.TP
|
319
308
|
.BI --split-prefix \ STR
|
320
309
|
Prefix to create temporary files. Typically used for a multi-part index.
|
@@ -334,10 +323,6 @@ Only map to the reverse complement strand of the reference sequences.
|
|
334
323
|
If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
|
335
324
|
faster for short reads, but slower for long reads. [no]
|
336
325
|
.TP
|
337
|
-
.B --no-pairing
|
338
|
-
Treat two reads in a pair as independent reads. The mate related fields in SAM
|
339
|
-
are still properly populated.
|
340
|
-
.TP
|
341
326
|
.B --no-hash-name
|
342
327
|
Produce the same alignment for identical sequences regardless of their sequence names.
|
343
328
|
.SS Alignment options
|
@@ -371,7 +356,16 @@ Splice model [1]. 0 for the original minimap2 splice model that always penalizes
|
|
371
356
|
.B -C
|
372
357
|
has no effect with the default
|
373
358
|
.BR -J1 .
|
374
|
-
.
|
359
|
+
.TP
|
360
|
+
.BR -j \ FILE
|
361
|
+
Junctions used to extend alignment towards ends of reads [].
|
362
|
+
.I FILE
|
363
|
+
can be gene annotations in the BED12 format (aka 12-column BED), or intron
|
364
|
+
positions in 5-column BED with the strand column required. BED12 file can be
|
365
|
+
converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'. This option is
|
366
|
+
intended for short RNA-seq reads, while
|
367
|
+
.B --junc-bed
|
368
|
+
is for long noisy RNA-seq reads.
|
375
369
|
.TP
|
376
370
|
.BI -C \ INT
|
377
371
|
Cost for a non-canonical GT-AG splicing (effective with
|
@@ -414,7 +408,16 @@ no attempt to match GT-AG [n]
|
|
414
408
|
Score bonus when alignment extends to the end of the query sequence [0].
|
415
409
|
.TP
|
416
410
|
.BI --score-N \ INT
|
417
|
-
|
411
|
+
Penalty of a mismatch involving ambiguous bases [1].
|
412
|
+
.TP
|
413
|
+
.BR --pairing = strong | weak | no
|
414
|
+
How to pair paired-end reads [strong].
|
415
|
+
.RB ` no '
|
416
|
+
for aligning the two ends in a pair independently with no `properly paired' set.
|
417
|
+
.RB ` weak '
|
418
|
+
for aligning the two ends independently and then pairing the hits.
|
419
|
+
.RB ` strong '
|
420
|
+
for jointly aligning and pairing the two ends.
|
418
421
|
.TP
|
419
422
|
.BR --splice-flank = yes | no
|
420
423
|
Assume the next base to a
|
@@ -433,16 +436,40 @@ on SIRV data, please add
|
|
433
436
|
.B --splice-flank=no
|
434
437
|
to the command line.
|
435
438
|
.TP
|
439
|
+
.BR --spsc \ FILE
|
440
|
+
Splice scores []. Each line consists of five fields: 1) contig, 2) offset, 3) `+' or `-', 4) `D' or `A', and 5) score,
|
441
|
+
where offset is the number of bases before a splice junction, `D' indicates the
|
442
|
+
line corresponds to a donor site and `A' for an acceptor site.
|
443
|
+
A positive score suggests the junction is preferred and a negative score
|
444
|
+
suggests the junction is not preferred.
|
445
|
+
.TP
|
446
|
+
.BR --junc-pen \ INT
|
447
|
+
Penalty for a position not in FILE specified by
|
448
|
+
.B --spsc
|
449
|
+
[5]. Effective with
|
450
|
+
.B --spsc
|
451
|
+
but not
|
452
|
+
.BR --junc-bed .
|
453
|
+
.TP
|
436
454
|
.BR --junc-bed \ FILE
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
455
|
+
Junctions to prefer during base alignment [].
|
456
|
+
Same format as
|
457
|
+
.BR -j .
|
458
|
+
It is
|
459
|
+
.I NOT
|
460
|
+
recommended to apply this option to short RNA-seq reads. This would increase
|
461
|
+
run time with little improvement to junction accuracy.
|
441
462
|
.TP
|
442
463
|
.BR --junc-bonus \ INT
|
443
|
-
Score bonus for a splice donor or acceptor found in annotation
|
444
|
-
.
|
445
|
-
|
464
|
+
Score bonus for a splice donor or acceptor found in annotation [9]. Effective with
|
465
|
+
.B --junc-bed
|
466
|
+
but not
|
467
|
+
.BR --spsc .
|
468
|
+
.TP
|
469
|
+
.BR --jump-min-match \ INT
|
470
|
+
Minimum matching length to create a jump [3]. Equivalent to
|
471
|
+
.B STAR
|
472
|
+
.BR --alignSJDBoverhangMin .
|
446
473
|
.TP
|
447
474
|
.BI --end-seed-pen \ INT
|
448
475
|
Drop a terminal anchor if
|
@@ -500,20 +527,13 @@ Copy input FASTA/Q comments to output.
|
|
500
527
|
.B -c
|
501
528
|
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
|
502
529
|
.TP
|
503
|
-
.
|
530
|
+
.BR --cs [= short | long ]
|
504
531
|
Output the
|
505
532
|
.B cs
|
506
533
|
tag.
|
507
|
-
|
508
|
-
|
509
|
-
.
|
510
|
-
or
|
511
|
-
.IR long .
|
512
|
-
If no
|
513
|
-
.I STR
|
514
|
-
is given,
|
515
|
-
.I short
|
516
|
-
is assumed. [none]
|
534
|
+
If no argument is given,
|
535
|
+
.RB ` short '
|
536
|
+
is set. [none]
|
517
537
|
.TP
|
518
538
|
.B --MD
|
519
539
|
Output the MD tag (see the SAM spec).
|
@@ -527,6 +547,26 @@ In SAM output, use soft clipping for supplementary alignments.
|
|
527
547
|
.B --secondary-seq
|
528
548
|
In SAM output, show query sequences for secondary alignments.
|
529
549
|
.TP
|
550
|
+
.B --write-junc
|
551
|
+
Output splice junctions in 6-column BED: contig name, start, end,
|
552
|
+
read name, score and strand. Score is the sum of donor and acceptor scores,
|
553
|
+
where GT gets 3, GC gets 2 and AT gets 1 at donor sites,
|
554
|
+
while AG gets 3 and AC gets 1 at acceptor sites.
|
555
|
+
Alignments with mapping quality below 10 are ignored.
|
556
|
+
.TP
|
557
|
+
.BI --pass1 \ FILE
|
558
|
+
Junction BED file produced by
|
559
|
+
.B --write-junc
|
560
|
+
[]. Rows with scores lower than 5 are ignored. When both
|
561
|
+
.B -j
|
562
|
+
and
|
563
|
+
.B --pass1
|
564
|
+
are present, junctions in
|
565
|
+
.B -j
|
566
|
+
are preferred over those in
|
567
|
+
.BR --pass1
|
568
|
+
when there is ambiguity.
|
569
|
+
.TP
|
530
570
|
.BI --seed \ INT
|
531
571
|
Integer seed for randomizing equally best hits. Minimap2 hashes
|
532
572
|
.I INT
|
@@ -666,10 +706,16 @@ Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
|
|
666
706
|
.B -C5 -O6,24
|
667
707
|
.BR -B4 ).
|
668
708
|
.TP
|
709
|
+
.B splice:sr
|
710
|
+
Spliced alignment for short RNA-seq reads
|
711
|
+
.RB ( -xsplice:hq
|
712
|
+
.B --frag=yes -m25 -s40 -2K100m --heap-sort=yes --pairing=weak --sr=rna --min-dp-len=20
|
713
|
+
.BR --secondary=no ).
|
714
|
+
.TP
|
669
715
|
.B sr
|
670
716
|
Short-read alignment without splicing
|
671
717
|
.RB ( -k21
|
672
|
-
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -
|
718
|
+
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
|
673
719
|
.B -s40 -g100 -2K50m --heap-sort=yes
|
674
720
|
.BR --secondary=no ).
|
675
721
|
.TP
|
@@ -742,7 +788,7 @@ s2 i Chaining score of the best secondary chain
|
|
742
788
|
NM i Total number of mismatches and gaps in the alignment
|
743
789
|
MD Z To generate the ref sequence in the alignment
|
744
790
|
AS i DP alignment score
|
745
|
-
SA Z List of other supplementary alignments
|
791
|
+
SA Z List of other supplementary alignments (with approximate CIGAR strings)
|
746
792
|
ms i DP score of the max scoring segment in the alignment
|
747
793
|
nn i Number of ambiguous bases in the alignment
|
748
794
|
ts A Transcript strand (splice mode only)
|
@@ -751,6 +797,7 @@ cs Z Difference string
|
|
751
797
|
dv f Approximate per-base sequence divergence
|
752
798
|
de f Gap-compressed per-base sequence divergence
|
753
799
|
rl i Length of query regions harboring repetitive seeds
|
800
|
+
zd i Alignment broken due to Z-drop; bit 1: left broken; bit 2: right broken
|
754
801
|
.TE
|
755
802
|
|
756
803
|
.PP
|
data/ext/minimap2/misc/README.md
CHANGED
@@ -16,7 +16,8 @@ minimap2 -c test/MT-human.fa test/MT-orang.fa \
|
|
16
16
|
| paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
|
17
17
|
# no test data for the following examples
|
18
18
|
paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
|
19
|
-
paftools.js splice2bed
|
19
|
+
paftools.js splice2bed splice.sam > splice.bed # convert PAF/SAM to BED12
|
20
|
+
paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
|
20
21
|
```
|
21
22
|
|
22
23
|
## Table of Contents
|
@@ -0,0 +1,241 @@
|
|
1
|
+
#!/usr/bin/env k8
|
2
|
+
|
3
|
+
"use strict";
|
4
|
+
|
5
|
+
Array.prototype.delete_at = function(i) {
|
6
|
+
for (let j = i; j < this.length - 1; ++j)
|
7
|
+
this[j] = this[j + 1];
|
8
|
+
--this.length;
|
9
|
+
}
|
10
|
+
|
11
|
+
function* getopt(argv, ostr, longopts) {
|
12
|
+
if (argv.length == 0) return;
|
13
|
+
let pos = 0, cur = 0;
|
14
|
+
while (cur < argv.length) {
|
15
|
+
let lopt = "", opt = "?", arg = "";
|
16
|
+
while (cur < argv.length) { // skip non-option arguments
|
17
|
+
if (argv[cur][0] == "-" && argv[cur].length > 1) {
|
18
|
+
if (argv[cur] == "--") cur = argv.length;
|
19
|
+
break;
|
20
|
+
} else ++cur;
|
21
|
+
}
|
22
|
+
if (cur == argv.length) break;
|
23
|
+
let a = argv[cur];
|
24
|
+
if (a[0] == "-" && a[1] == "-") { // a long option
|
25
|
+
pos = -1;
|
26
|
+
let c = 0, k = -1, tmp = "", o;
|
27
|
+
const pos_eq = a.indexOf("=");
|
28
|
+
if (pos_eq > 0) {
|
29
|
+
o = a.substring(2, pos_eq);
|
30
|
+
arg = a.substring(pos_eq + 1);
|
31
|
+
} else o = a.substring(2);
|
32
|
+
for (let i = 0; i < longopts.length; ++i) {
|
33
|
+
let y = longopts[i];
|
34
|
+
if (y[y.length - 1] == "=") y = y.substring(0, y.length - 1);
|
35
|
+
if (o.length <= y.length && o == y.substring(0, o.length)) {
|
36
|
+
k = i, tmp = y;
|
37
|
+
++c; // c is the number of matches
|
38
|
+
if (o == y) { // exact match
|
39
|
+
c = 1;
|
40
|
+
break;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
}
|
44
|
+
if (c == 1) { // find a unique match
|
45
|
+
lopt = tmp;
|
46
|
+
if (pos_eq < 0 && longopts[k][longopts[k].length-1] == "=" && cur + 1 < argv.length) {
|
47
|
+
arg = argv[cur+1];
|
48
|
+
argv.delete_at(cur + 1);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
} else { // a short option
|
52
|
+
if (pos == 0) pos = 1;
|
53
|
+
opt = a[pos++];
|
54
|
+
let k = ostr.indexOf(opt);
|
55
|
+
if (k < 0) {
|
56
|
+
opt = "?";
|
57
|
+
} else if (k + 1 < ostr.length && ostr[k+1] == ":") { // requiring an argument
|
58
|
+
if (pos >= a.length) {
|
59
|
+
arg = argv[cur+1];
|
60
|
+
argv.delete_at(cur + 1);
|
61
|
+
} else arg = a.substring(pos);
|
62
|
+
pos = -1;
|
63
|
+
}
|
64
|
+
}
|
65
|
+
if (pos < 0 || pos >= argv[cur].length) {
|
66
|
+
argv.delete_at(cur);
|
67
|
+
pos = 0;
|
68
|
+
}
|
69
|
+
if (lopt != "") yield { opt: `--${lopt}`, arg: arg };
|
70
|
+
else if (opt != "?") yield { opt: `-${opt}`, arg: arg };
|
71
|
+
else yield { opt: "?", arg: "" };
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
function* k8_readline(fn) {
|
76
|
+
let buf = new Bytes();
|
77
|
+
let file = new File(fn);
|
78
|
+
while (file.readline(buf) >= 0) {
|
79
|
+
yield buf.toString();
|
80
|
+
}
|
81
|
+
file.close();
|
82
|
+
buf.destroy();
|
83
|
+
}
|
84
|
+
|
85
|
+
function merge_hits(b) {
|
86
|
+
if (b.length == 1)
|
87
|
+
return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:b[0].min_cov, max_cov:b[0].max_cov, cov1:b[0].cov1, cov2:b[0].cov2, s1:b[0].s1, dv:b[0].dv };
|
88
|
+
b.sort(function(x, y) { return x.st1 - y.st1 });
|
89
|
+
let f = [], bt = [];
|
90
|
+
for (let i = 0; i < b.length; ++i)
|
91
|
+
f[i] = b[i].s1, bt[i] = -1;
|
92
|
+
for (let i = 0; i < b.length; ++i) {
|
93
|
+
for (let j = 0; j < i; ++j) {
|
94
|
+
if (b[j].st2 < b[i].st2) {
|
95
|
+
if (b[j].en1 >= b[i].en1) continue;
|
96
|
+
if (b[j].en2 >= b[i].en2) continue;
|
97
|
+
const ov1 = b[j].en1 <= b[i].st1? 0 : b[i].st1 - b[j].en1;
|
98
|
+
const li1 = b[i].en1 - b[i].st1;
|
99
|
+
const s11 = b[i].s1 / li1 * (li1 - ov1);
|
100
|
+
const ov2 = b[j].en2 <= b[i].st2? 0 : b[i].st2 - b[j].en2;
|
101
|
+
const li2 = b[i].en2 - b[i].st2;
|
102
|
+
const s12 = b[i].s1 / li2 * (li2 - ov2);
|
103
|
+
const s1 = s11 < s12? s11 : s12;
|
104
|
+
if (f[i] < f[j] + s1)
|
105
|
+
f[i] = f[j] + s1, bt[i] = j;
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
let max_i = -1, max_f = 0, d = [];
|
110
|
+
for (let i = 0; i < b.length; ++i)
|
111
|
+
if (max_f < f[i])
|
112
|
+
max_f = f[i], max_i = i;
|
113
|
+
for (let k = max_i; k >= 0; k = bt[k])
|
114
|
+
d.push(k);
|
115
|
+
d = d.reverse();
|
116
|
+
let dv = 0, tot = 0, cov1 = 0, cov2 = 0, st1 = 0, en1 = 0, st2 = 0, en2 = 0;
|
117
|
+
for (let k = 0; k < d.length; ++k) {
|
118
|
+
const i = d[k];
|
119
|
+
tot += b[i].blen;
|
120
|
+
dv += b[i].dv * b[i].blen;
|
121
|
+
if (b[i].st1 > en1) {
|
122
|
+
cov1 += en1 - st1;
|
123
|
+
st1 = b[i].st1, en1 = b[i].en1;
|
124
|
+
} else en1 = en1 > b[i].en1? en1 : b[i].en1;
|
125
|
+
if (b[i].st2 > en2) {
|
126
|
+
cov2 += en2 - st2;
|
127
|
+
st2 = b[i].st2, en2 = b[i].en2;
|
128
|
+
} else en2 = en2 > b[i].en2? en2 : b[i].en2;
|
129
|
+
}
|
130
|
+
dv /= tot;
|
131
|
+
cov1 = (cov1 + (en1 - st1)) / b[0].len1;
|
132
|
+
cov2 = (cov2 + (en2 - st2)) / b[0].len2;
|
133
|
+
const min_cov = cov1 < cov2? cov1 : cov2;
|
134
|
+
const max_cov = cov1 > cov2? cov1 : cov2;
|
135
|
+
//warn(d.length, b[0].name1, b[0].name2, min_cov, max_cov);
|
136
|
+
return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:min_cov, max_cov:max_cov, cov1:cov1, cov2:cov2, s1:max_f, dv:dv };
|
137
|
+
}
|
138
|
+
|
139
|
+
function main(args) {
|
140
|
+
let opt = { min_cov:.9, max_dv:.015, max_diff:20000 };
|
141
|
+
for (const o of getopt(args, "c:d:e:", [])) {
|
142
|
+
if (o.opt == '-c') opt.min_cov = parseFloat(o.arg);
|
143
|
+
else if (o.opt == '-d') opt.max_dv = parseFloat(o.arg);
|
144
|
+
else if (o.opt == '-e') opt.max_diff = parseFloat(o.arg);
|
145
|
+
}
|
146
|
+
if (args.length == 0) {
|
147
|
+
print("Usage: pafcluster.js [options] <ava.paf>");
|
148
|
+
print("Options:");
|
149
|
+
print(` -c FLOAT min coverage [${opt.min_cov}]`);
|
150
|
+
print(` -d FLOAT max divergence [${opt.max_dv}]`);
|
151
|
+
print(` -e FLOAT max difference [${opt.max_diff}]`);
|
152
|
+
return;
|
153
|
+
}
|
154
|
+
|
155
|
+
// read
|
156
|
+
let a = [], len = {}, name2len = {};
|
157
|
+
for (const line of k8_readline(args[0])) {
|
158
|
+
let m, t = line.split("\t");
|
159
|
+
if (t[4] != "+") continue;
|
160
|
+
for (let i = 1; i < 4; ++i) t[i] = parseInt(t[i]);
|
161
|
+
for (let i = 6; i < 11; ++i) t[i] = parseInt(t[i]);
|
162
|
+
const len1 = t[1], len2 = t[6];
|
163
|
+
let s1 = -1, dv = -1.0;
|
164
|
+
for (let i = 12; i < t.length; ++i) {
|
165
|
+
if ((m = /^(s1|dv):\S:(\S+)/.exec(t[i])) != null) {
|
166
|
+
if (m[1] == "s1") s1 = parseInt(m[2]);
|
167
|
+
else if (m[1] == "dv") dv = parseFloat(m[2]);
|
168
|
+
}
|
169
|
+
}
|
170
|
+
if (s1 < 0 || dv < 0) continue;
|
171
|
+
const cov1 = (parseInt(t[3]) - parseInt(t[2])) / len1;
|
172
|
+
const cov2 = (parseInt(t[8]) - parseInt(t[7])) / len2;
|
173
|
+
const min_cov = cov1 < cov2? cov1 : cov2;
|
174
|
+
const max_cov = cov1 > cov2? cov1 : cov2;
|
175
|
+
name2len[t[0]] = len1;
|
176
|
+
name2len[t[5]] = len2;
|
177
|
+
a.push({ name1:t[0], name2:t[5], len1:len1, len2:len2, min_cov:min_cov, max_cov:max_cov, s1:s1, dv:dv, cov1:cov1, cov2:cov2, st1:t[2], en1:t[3], st2:t[7], en2:t[8], blen:t[10] });
|
178
|
+
len[t[0]] = len1, len[t[5]] = len2;
|
179
|
+
}
|
180
|
+
warn(`Read ${a.length} hits`);
|
181
|
+
|
182
|
+
// merge duplicated hits
|
183
|
+
let h = {};
|
184
|
+
for (let i = 0; i < a.length; ++i) {
|
185
|
+
const key = `${a[i].name1}\t${a[i].name2}`;
|
186
|
+
if (h[key] == null) h[key] = [];
|
187
|
+
h[key].push(a[i]);
|
188
|
+
}
|
189
|
+
a = [];
|
190
|
+
for (const key in h)
|
191
|
+
a.push(merge_hits(h[key]));
|
192
|
+
|
193
|
+
// core loop
|
194
|
+
while (a.length > 1) {
|
195
|
+
// select the sequence with the highest sum of s1
|
196
|
+
let h = {};
|
197
|
+
for (let i = 0; i < a.length; ++i) {
|
198
|
+
if (h[a[i].name1] == null) h[a[i].name1] = 0;
|
199
|
+
h[a[i].name1] += a[i].s1;
|
200
|
+
}
|
201
|
+
let max_s1 = 0, max_name = "";
|
202
|
+
for (const name in h)
|
203
|
+
if (max_s1 < h[name])
|
204
|
+
max_s1 = h[name], max_name = name;
|
205
|
+
// find contigs in the same group
|
206
|
+
h = {};
|
207
|
+
h[max_name] = 1;
|
208
|
+
for (let i = 0; i < a.length; ++i) {
|
209
|
+
if (a[i].name1 != max_name && a[i].name2 != max_name)
|
210
|
+
continue;
|
211
|
+
const diff1 = a[i].len1 * (1.0 - a[i].cov1);
|
212
|
+
const diff2 = a[i].len2 * (1.0 - a[i].cov2);
|
213
|
+
if (a[i].min_cov >= opt.min_cov && a[i].dv <= opt.max_dv && diff1 <= opt.max_diff && diff2 <= opt.max_diff)
|
214
|
+
h[a[i].name1] = h[a[i].name2] = 1;
|
215
|
+
}
|
216
|
+
let n = 0;
|
217
|
+
for (const key in h) {
|
218
|
+
++n;
|
219
|
+
delete name2len[key];
|
220
|
+
}
|
221
|
+
print(`SD\t${max_name}\t${n}`);
|
222
|
+
for (const key in h) print(`CL\t${key}\t${len[key]}`);
|
223
|
+
print("//");
|
224
|
+
// filter out redundant hits
|
225
|
+
let b = [];
|
226
|
+
for (let i = 0; i < a.length; ++i)
|
227
|
+
if (h[a[i].name1] == null && h[a[i].name2] == null)
|
228
|
+
b.push(a[i]);
|
229
|
+
warn(`Reduced the number of hits from ${a.length} to ${b.length}`);
|
230
|
+
a = b;
|
231
|
+
}
|
232
|
+
|
233
|
+
// output remaining singletons
|
234
|
+
for (const key in name2len) {
|
235
|
+
print(`SD\t${key}\t1`);
|
236
|
+
print(`CL\t${key}\t${name2len[key]}`);
|
237
|
+
print(`//`);
|
238
|
+
}
|
239
|
+
}
|
240
|
+
|
241
|
+
main(arguments);
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.29-r1283';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -2187,7 +2187,7 @@ function paf_mapeval(args)
|
|
2187
2187
|
}
|
2188
2188
|
|
2189
2189
|
var lineno = 0, last = null, a = [], n_unmapped = null;
|
2190
|
-
var re_cigar = /(\d+)([MIDSHN])/g;
|
2190
|
+
var re_cigar = /(\d+)([MIDSHN=X])/g;
|
2191
2191
|
while (file.readline(buf) >= 0) {
|
2192
2192
|
var m, line = buf.toString();
|
2193
2193
|
++lineno;
|
@@ -2225,7 +2225,7 @@ function paf_mapeval(args)
|
|
2225
2225
|
var n_gap = 0, mlen = 0;
|
2226
2226
|
while ((m = re_cigar.exec(t[5])) != null) {
|
2227
2227
|
var len = parseInt(m[1]);
|
2228
|
-
if (m[2] == 'M') pos_end += len, mlen += len;
|
2228
|
+
if (m[2] == 'M' || m[2] == 'X' || m[2] == '=') pos_end += len, mlen += len;
|
2229
2229
|
else if (m[2] == 'I') n_gap += len;
|
2230
2230
|
else if (m[2] == 'D') n_gap += len, pos_end += len;
|
2231
2231
|
}
|
@@ -2494,6 +2494,10 @@ function paf_junceval(args)
|
|
2494
2494
|
} else { // SAM
|
2495
2495
|
ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
|
2496
2496
|
var flag = parseInt(t[1]);
|
2497
|
+
if (flag & 1) {
|
2498
|
+
if (flag & 0x40) qname += '/1';
|
2499
|
+
else if (flag & 0x80) qname += '/2';
|
2500
|
+
}
|
2497
2501
|
if (flag&0x100) continue; // secondary
|
2498
2502
|
}
|
2499
2503
|
|
@@ -3240,6 +3244,7 @@ function paf_sveval(args)
|
|
3240
3244
|
if (bed != null && bed[t[0]] == null) continue;
|
3241
3245
|
if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
|
3242
3246
|
if (/[\[\]]/.test(t[4])) continue; // no break points
|
3247
|
+
if (t[6] != "." && t[6] != "PASS") continue;
|
3243
3248
|
var st = parseInt(t[1]) - 1, en = st + t[3].length;
|
3244
3249
|
// parse svlen
|
3245
3250
|
var b = _paf_get_alen(t), svlen = b[0];
|
data/ext/minimap2/mmpriv.h
CHANGED
@@ -24,6 +24,9 @@
|
|
24
24
|
#define MM_SEED_SEG_SHIFT 48
|
25
25
|
#define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))
|
26
26
|
|
27
|
+
#define MM_JUNC_ANNO 0x1
|
28
|
+
#define MM_JUNC_MISC 0x2
|
29
|
+
|
27
30
|
#ifndef kroundup32
|
28
31
|
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
29
32
|
#endif
|
@@ -33,6 +36,7 @@
|
|
33
36
|
|
34
37
|
#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
|
35
38
|
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
|
39
|
+
#define REALLOC(type, ptr, cnt) ((type*)realloc((ptr), (cnt) * sizeof(type)))
|
36
40
|
|
37
41
|
#ifdef __cplusplus
|
38
42
|
extern "C" {
|
@@ -52,6 +56,12 @@ typedef struct {
|
|
52
56
|
mm128_t *a;
|
53
57
|
} mm_seg_t;
|
54
58
|
|
59
|
+
typedef struct {
|
60
|
+
int32_t off, off2, cnt;
|
61
|
+
int16_t strand;
|
62
|
+
uint16_t flag;
|
63
|
+
} mm_idx_jjump1_t;
|
64
|
+
|
55
65
|
double cputime(void);
|
56
66
|
double realtime(void);
|
57
67
|
long peakrss(void);
|
@@ -69,17 +79,23 @@ double mm_event_identity(const mm_reg1_t *r);
|
|
69
79
|
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
|
70
80
|
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag);
|
71
81
|
void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len);
|
82
|
+
void mm_write_paf4(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len, int n_seg, int seg_idx);
|
72
83
|
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
73
84
|
void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag);
|
74
85
|
void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
|
86
|
+
void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
|
75
87
|
|
88
|
+
// indexing related in index.c
|
76
89
|
void mm_idxopt_init(mm_idxopt_t *opt);
|
77
90
|
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
78
91
|
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
|
79
92
|
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
80
|
-
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
81
93
|
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
|
94
|
+
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
|
95
|
+
int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc);
|
96
|
+
const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n);
|
82
97
|
|
98
|
+
// chaining in lchain.c
|
83
99
|
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
84
100
|
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
|
85
101
|
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
|
@@ -96,8 +112,12 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int
|
|
96
112
|
int mm_filter_strand_retained(int n_regs, mm_reg1_t *r);
|
97
113
|
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
|
98
114
|
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
|
99
|
-
void
|
115
|
+
void mm_set_mapq2(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr, int is_splice);
|
100
116
|
void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b);
|
117
|
+
void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
|
118
|
+
|
119
|
+
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
120
|
+
void mm_enlarge_cigar(mm_reg1_t *r, uint32_t n_cigar);
|
101
121
|
|
102
122
|
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
|
103
123
|
|
@@ -105,6 +125,8 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
|
|
105
125
|
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
|
106
126
|
void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
|
107
127
|
|
128
|
+
void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
|
129
|
+
|
108
130
|
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
|
109
131
|
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
|
110
132
|
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
|