minimap2 0.2.27.0 → 0.2.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/ext/cmappy/cmappy.c +3 -3
- data/ext/cmappy/cmappy.h +1 -1
- data/ext/minimap2/FAQ.md +1 -1
- data/ext/minimap2/Makefile +4 -3
- data/ext/minimap2/NEWS.md +68 -0
- data/ext/minimap2/README.md +30 -14
- data/ext/minimap2/align.c +136 -52
- data/ext/minimap2/cookbook.md +2 -2
- data/ext/minimap2/format.c +59 -5
- data/ext/minimap2/hit.c +14 -6
- data/ext/minimap2/index.c +304 -13
- data/ext/minimap2/jump.c +201 -0
- data/ext/minimap2/kalloc.h +8 -0
- data/ext/minimap2/ksw2.h +5 -2
- data/ext/minimap2/ksw2_dispatch.c +5 -5
- data/ext/minimap2/ksw2_exts2_sse.c +17 -6
- data/ext/minimap2/lchain.c +5 -5
- data/ext/minimap2/main.c +64 -12
- data/ext/minimap2/map.c +35 -8
- data/ext/minimap2/minimap.h +14 -3
- data/ext/minimap2/minimap2.1 +98 -46
- data/ext/minimap2/misc/README.md +2 -1
- data/ext/minimap2/misc/pafcluster.js +241 -0
- data/ext/minimap2/misc/paftools.js +17 -6
- data/ext/minimap2/mmpriv.h +25 -4
- data/ext/minimap2/options.c +36 -3
- data/ext/minimap2/python/cmappy.h +3 -3
- data/ext/minimap2/python/cmappy.pxd +5 -2
- data/ext/minimap2/python/mappy.pyx +20 -7
- data/ext/minimap2/python/minimap2.py +5 -3
- data/ext/minimap2/seed.c +2 -1
- data/ext/minimap2/setup.py +2 -2
- data/ext/minimap2.patch +2 -2
- data/lib/minimap2/aligner.rb +19 -12
- data/lib/minimap2/alignment.rb +1 -0
- data/lib/minimap2/ffi/constants.rb +10 -2
- data/lib/minimap2/ffi/functions.rb +145 -6
- data/lib/minimap2/ffi/mappy.rb +1 -1
- data/lib/minimap2/version.rb +1 -1
- data/lib/minimap2.rb +2 -2
- metadata +8 -7
- data/ext/minimap2/misc/mmphase.js +0 -335
data/ext/minimap2/minimap.h
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
#include <stdio.h>
|
6
6
|
#include <sys/types.h>
|
7
7
|
|
8
|
-
#define MM_VERSION "2.
|
8
|
+
#define MM_VERSION "2.29-r1283"
|
9
9
|
|
10
10
|
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
|
11
11
|
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
|
@@ -45,6 +45,9 @@
|
|
45
45
|
#define MM_F_SPLICE_OLD (0x800000000LL)
|
46
46
|
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping
|
47
47
|
#define MM_F_OUT_DS (0x2000000000LL)
|
48
|
+
#define MM_F_WEAK_PAIRING (0x4000000000LL)
|
49
|
+
#define MM_F_SR_RNA (0x8000000000LL)
|
50
|
+
#define MM_F_OUT_JUNC (0x10000000000LL)
|
48
51
|
|
49
52
|
#define MM_I_HPC 0x1
|
50
53
|
#define MM_I_NO_SEQ 0x2
|
@@ -91,6 +94,8 @@ typedef struct {
|
|
91
94
|
uint32_t *S; // 4-bit packed sequence
|
92
95
|
struct mm_idx_bucket_s *B; // index (hidden)
|
93
96
|
struct mm_idx_intv_s *I; // intervals (hidden)
|
97
|
+
struct mm_idx_spsc_s *spsc;// splice score (hidden)
|
98
|
+
struct mm_idx_jjump_s *J; // junctions to create jumps (hidden)
|
94
99
|
void *km, *h;
|
95
100
|
} mm_idx_t;
|
96
101
|
|
@@ -115,7 +120,7 @@ typedef struct {
|
|
115
120
|
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
|
116
121
|
int32_t n_sub; // number of suboptimal mappings
|
117
122
|
int32_t score0; // initial chaining score (before chain merging/splitting)
|
118
|
-
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:
|
123
|
+
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, is_spliced:1, dummy:4;
|
119
124
|
uint32_t hash;
|
120
125
|
float div;
|
121
126
|
mm_extra_t *p;
|
@@ -158,7 +163,7 @@ typedef struct {
|
|
158
163
|
int transition; // transition mismatch score (A:G, C:T)
|
159
164
|
int sc_ambi; // score when one or both bases are "N"
|
160
165
|
int noncan; // cost of non-canonical splicing sites
|
161
|
-
int junc_bonus;
|
166
|
+
int junc_bonus, junc_pen;
|
162
167
|
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
|
163
168
|
int end_bonus;
|
164
169
|
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
|
@@ -171,6 +176,8 @@ typedef struct {
|
|
171
176
|
|
172
177
|
int pe_ori, pe_bonus;
|
173
178
|
|
179
|
+
int32_t jump_min_match;
|
180
|
+
|
174
181
|
float mid_occ_frac; // only used by mm_mapopt_update(); see below
|
175
182
|
float q_occ_frac;
|
176
183
|
int32_t min_mid_occ, max_mid_occ;
|
@@ -411,6 +418,10 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
|
|
411
418
|
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
|
412
419
|
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
|
413
420
|
|
421
|
+
int mm_max_spsc_bonus(const mm_mapopt_t *mo);
|
422
|
+
int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc);
|
423
|
+
int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st0, int64_t en0, int32_t rev, uint8_t *sc);
|
424
|
+
|
414
425
|
// deprecated APIs for backward compatibility
|
415
426
|
void mm_mapopt_init(mm_mapopt_t *opt);
|
416
427
|
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
data/ext/minimap2/minimap2.1
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
.TH minimap2 1 "
|
1
|
+
.TH minimap2 1 "18 April 2025" "minimap2-2.29 (r1283)" "Bioinformatics tools"
|
2
2
|
.SH NAME
|
3
3
|
.PP
|
4
4
|
minimap2 - mapping and alignment between collections of DNA sequences
|
@@ -79,19 +79,6 @@ Minimizer k-mer length [15]
|
|
79
79
|
.BI -w \ INT
|
80
80
|
Minimizer window size [10]. A minimizer is the smallest k-mer
|
81
81
|
in a window of w consecutive k-mers.
|
82
|
-
.TP
|
83
|
-
.BI -j \ INT
|
84
|
-
Syncmer submer size [10]. Option
|
85
|
-
.B -j
|
86
|
-
and
|
87
|
-
.B -w
|
88
|
-
will override each: if
|
89
|
-
.B -w
|
90
|
-
is applied after
|
91
|
-
.BR -j ,
|
92
|
-
.B -j
|
93
|
-
will have no effect, and vice versa.
|
94
|
-
|
95
82
|
.TP
|
96
83
|
.B -H
|
97
84
|
Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
|
@@ -268,6 +255,11 @@ or more of the shorter chain [0.5]
|
|
268
255
|
Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
|
269
256
|
for aligning contigs through long INDELs.
|
270
257
|
.TP
|
258
|
+
.BI --rmq-inner \ NUM
|
259
|
+
Apply full dynamic programming for anchors within distance
|
260
|
+
.I NUM
|
261
|
+
[1000].
|
262
|
+
.TP
|
271
263
|
.B --hard-mask-level
|
272
264
|
Honor option
|
273
265
|
.B -M
|
@@ -305,11 +297,13 @@ maximum alignment gap is mostly controlled by
|
|
305
297
|
.B --splice
|
306
298
|
Enable the splice alignment mode.
|
307
299
|
.TP
|
308
|
-
.
|
309
|
-
Enable short-read alignment heuristics.
|
310
|
-
|
311
|
-
|
312
|
-
|
300
|
+
.BR --sr [= no | dna | rna ]
|
301
|
+
Enable short-read alignment heuristics [no]. If this option is used with no argument,
|
302
|
+
.RB ` dna '
|
303
|
+
is set. In the DNA short-read mode, minimap2 applies a second round of chaining
|
304
|
+
with a higher minimizer occurrence threshold if no good chain is found. In
|
305
|
+
addition, minimap2 attempts to patch gaps between seeds with ungapped
|
306
|
+
alignment.
|
313
307
|
.TP
|
314
308
|
.BI --split-prefix \ STR
|
315
309
|
Prefix to create temporary files. Typically used for a multi-part index.
|
@@ -329,10 +323,6 @@ Only map to the reverse complement strand of the reference sequences.
|
|
329
323
|
If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
|
330
324
|
faster for short reads, but slower for long reads. [no]
|
331
325
|
.TP
|
332
|
-
.B --no-pairing
|
333
|
-
Treat two reads in a pair as independent reads. The mate related fields in SAM
|
334
|
-
are still properly populated.
|
335
|
-
.TP
|
336
326
|
.B --no-hash-name
|
337
327
|
Produce the same alignment for identical sequences regardless of their sequence names.
|
338
328
|
.SS Alignment options
|
@@ -366,7 +356,16 @@ Splice model [1]. 0 for the original minimap2 splice model that always penalizes
|
|
366
356
|
.B -C
|
367
357
|
has no effect with the default
|
368
358
|
.BR -J1 .
|
369
|
-
.
|
359
|
+
.TP
|
360
|
+
.BR -j \ FILE
|
361
|
+
Junctions used to extend alignment towards ends of reads [].
|
362
|
+
.I FILE
|
363
|
+
can be gene annotations in the BED12 format (aka 12-column BED), or intron
|
364
|
+
positions in 5-column BED with the strand column required. BED12 file can be
|
365
|
+
converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'. This option is
|
366
|
+
intended for short RNA-seq reads, while
|
367
|
+
.B --junc-bed
|
368
|
+
is for long noisy RNA-seq reads.
|
370
369
|
.TP
|
371
370
|
.BI -C \ INT
|
372
371
|
Cost for a non-canonical GT-AG splicing (effective with
|
@@ -409,7 +408,16 @@ no attempt to match GT-AG [n]
|
|
409
408
|
Score bonus when alignment extends to the end of the query sequence [0].
|
410
409
|
.TP
|
411
410
|
.BI --score-N \ INT
|
412
|
-
|
411
|
+
Penalty of a mismatch involving ambiguous bases [1].
|
412
|
+
.TP
|
413
|
+
.BR --pairing = strong | weak | no
|
414
|
+
How to pair paired-end reads [strong].
|
415
|
+
.RB ` no '
|
416
|
+
for aligning the two ends in a pair independently with no `properly paired' set.
|
417
|
+
.RB ` weak '
|
418
|
+
for aligning the two ends independently and then pairing the hits.
|
419
|
+
.RB ` strong '
|
420
|
+
for jointly aligning and pairing the two ends.
|
413
421
|
.TP
|
414
422
|
.BR --splice-flank = yes | no
|
415
423
|
Assume the next base to a
|
@@ -428,16 +436,40 @@ on SIRV data, please add
|
|
428
436
|
.B --splice-flank=no
|
429
437
|
to the command line.
|
430
438
|
.TP
|
439
|
+
.BR --spsc \ FILE
|
440
|
+
Splice scores []. Each line consists of five fields: 1) contig, 2) offset, 3) `+' or `-', 4) `D' or `A', and 5) score,
|
441
|
+
where offset is the number of bases before a splice junction, `D' indicates the
|
442
|
+
line corresponds to a donor site and `A' for an acceptor site.
|
443
|
+
A positive score suggests the junction is preferred and a negative score
|
444
|
+
suggests the junction is not preferred.
|
445
|
+
.TP
|
446
|
+
.BR --junc-pen \ INT
|
447
|
+
Penalty for a position not in FILE specified by
|
448
|
+
.B --spsc
|
449
|
+
[5]. Effective with
|
450
|
+
.B --spsc
|
451
|
+
but not
|
452
|
+
.BR --junc-bed .
|
453
|
+
.TP
|
431
454
|
.BR --junc-bed \ FILE
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
455
|
+
Junctions to prefer during base alignment [].
|
456
|
+
Same format as
|
457
|
+
.BR -j .
|
458
|
+
It is
|
459
|
+
.I NOT
|
460
|
+
recommended to apply this option to short RNA-seq reads. This would increase
|
461
|
+
run time with little improvement to junction accuracy.
|
436
462
|
.TP
|
437
463
|
.BR --junc-bonus \ INT
|
438
|
-
Score bonus for a splice donor or acceptor found in annotation
|
439
|
-
.
|
440
|
-
|
464
|
+
Score bonus for a splice donor or acceptor found in annotation [9]. Effective with
|
465
|
+
.B --junc-bed
|
466
|
+
but not
|
467
|
+
.BR --spsc .
|
468
|
+
.TP
|
469
|
+
.BR --jump-min-match \ INT
|
470
|
+
Minimum matching length to create a jump [3]. Equivalent to
|
471
|
+
.B STAR
|
472
|
+
.BR --alignSJDBoverhangMin .
|
441
473
|
.TP
|
442
474
|
.BI --end-seed-pen \ INT
|
443
475
|
Drop a terminal anchor if
|
@@ -463,7 +495,7 @@ Set 0 to disable [100m].
|
|
463
495
|
.BI --cap-kalloc \ NUM
|
464
496
|
Free thread-local kalloc memory reservoir if, after the alignment, the size of the reservoir is above
|
465
497
|
.IR NUM .
|
466
|
-
Set 0 to disable [
|
498
|
+
Set 0 to disable [500m].
|
467
499
|
.SS Input/output options
|
468
500
|
.TP 10
|
469
501
|
.B -a
|
@@ -495,20 +527,13 @@ Copy input FASTA/Q comments to output.
|
|
495
527
|
.B -c
|
496
528
|
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
|
497
529
|
.TP
|
498
|
-
.
|
530
|
+
.BR --cs [= short | long ]
|
499
531
|
Output the
|
500
532
|
.B cs
|
501
533
|
tag.
|
502
|
-
|
503
|
-
|
504
|
-
.
|
505
|
-
or
|
506
|
-
.IR long .
|
507
|
-
If no
|
508
|
-
.I STR
|
509
|
-
is given,
|
510
|
-
.I short
|
511
|
-
is assumed. [none]
|
534
|
+
If no argument is given,
|
535
|
+
.RB ` short '
|
536
|
+
is set. [none]
|
512
537
|
.TP
|
513
538
|
.B --MD
|
514
539
|
Output the MD tag (see the SAM spec).
|
@@ -522,6 +547,26 @@ In SAM output, use soft clipping for supplementary alignments.
|
|
522
547
|
.B --secondary-seq
|
523
548
|
In SAM output, show query sequences for secondary alignments.
|
524
549
|
.TP
|
550
|
+
.B --write-junc
|
551
|
+
Output splice junctions in 6-column BED: contig name, start, end,
|
552
|
+
read name, score and strand. Score is the sum of donor and acceptor scores,
|
553
|
+
where GT gets 3, GC gets 2 and AT gets 1 at donor sites,
|
554
|
+
while AG gets 3 and AC gets 1 at acceptor sites.
|
555
|
+
Alignments with mapping quality below 10 are ignored.
|
556
|
+
.TP
|
557
|
+
.BI --pass1 \ FILE
|
558
|
+
Junctions BED file outputted by
|
559
|
+
.B --write-junc
|
560
|
+
[]. Rows with scores lower than 5 are ignored. When both
|
561
|
+
.B -j
|
562
|
+
and
|
563
|
+
.B --pass1
|
564
|
+
are present, junctions in
|
565
|
+
.B -j
|
566
|
+
are preferred over those in
|
567
|
+
.BR --pass1
|
568
|
+
when there is ambiguity.
|
569
|
+
.TP
|
525
570
|
.BI --seed \ INT
|
526
571
|
Integer seed for randomizing equally best hits. Minimap2 hashes
|
527
572
|
.I INT
|
@@ -661,10 +706,16 @@ Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
|
|
661
706
|
.B -C5 -O6,24
|
662
707
|
.BR -B4 ).
|
663
708
|
.TP
|
709
|
+
.B splice:sr
|
710
|
+
Spliced alignment for short RNA-seq reads
|
711
|
+
.RB ( -xsplice:hq
|
712
|
+
.B --frag=yes -m25 -s40 -2K100m --heap-sort=yes --pairing=weak --sr=rna --min-dp-len=20
|
713
|
+
.BR --secondary=no ).
|
714
|
+
.TP
|
664
715
|
.B sr
|
665
716
|
Short-read alignment without splicing
|
666
717
|
.RB ( -k21
|
667
|
-
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -
|
718
|
+
.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
|
668
719
|
.B -s40 -g100 -2K50m --heap-sort=yes
|
669
720
|
.BR --secondary=no ).
|
670
721
|
.TP
|
@@ -737,7 +788,7 @@ s2 i Chaining score of the best secondary chain
|
|
737
788
|
NM i Total number of mismatches and gaps in the alignment
|
738
789
|
MD Z To generate the ref sequence in the alignment
|
739
790
|
AS i DP alignment score
|
740
|
-
SA Z List of other supplementary alignments
|
791
|
+
SA Z List of other supplementary alignments (with approximate CIGAR strings)
|
741
792
|
ms i DP score of the max scoring segment in the alignment
|
742
793
|
nn i Number of ambiguous bases in the alignment
|
743
794
|
ts A Transcript strand (splice mode only)
|
@@ -746,6 +797,7 @@ cs Z Difference string
|
|
746
797
|
dv f Approximate per-base sequence divergence
|
747
798
|
de f Gap-compressed per-base sequence divergence
|
748
799
|
rl i Length of query regions harboring repetitive seeds
|
800
|
+
zd i Alignment broken due to Z-drop; bit 1: left broken; bit 2: right broken
|
749
801
|
.TE
|
750
802
|
|
751
803
|
.PP
|
data/ext/minimap2/misc/README.md
CHANGED
@@ -16,7 +16,8 @@ minimap2 -c test/MT-human.fa test/MT-orang.fa \
|
|
16
16
|
| paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000") # liftOver
|
17
17
|
# no test data for the following examples
|
18
18
|
paftools.js junceval -e anno.gtf splice.sam > out.txt # compare splice junctions to annotations
|
19
|
-
paftools.js splice2bed
|
19
|
+
paftools.js splice2bed splice.sam > splice.bed # convert PAF/SAM to BED12
|
20
|
+
paftools.js gff2bed anno.gtf > anno.bed # convert GTF/GFF3 to BED12
|
20
21
|
```
|
21
22
|
|
22
23
|
## Table of Contents
|
@@ -0,0 +1,241 @@
|
|
1
|
+
#!/usr/bin/env k8
|
2
|
+
|
3
|
+
"use strict";
|
4
|
+
|
5
|
+
Array.prototype.delete_at = function(i) {
|
6
|
+
for (let j = i; j < this.length - 1; ++j)
|
7
|
+
this[j] = this[j + 1];
|
8
|
+
--this.length;
|
9
|
+
}
|
10
|
+
|
11
|
+
function* getopt(argv, ostr, longopts) {
|
12
|
+
if (argv.length == 0) return;
|
13
|
+
let pos = 0, cur = 0;
|
14
|
+
while (cur < argv.length) {
|
15
|
+
let lopt = "", opt = "?", arg = "";
|
16
|
+
while (cur < argv.length) { // skip non-option arguments
|
17
|
+
if (argv[cur][0] == "-" && argv[cur].length > 1) {
|
18
|
+
if (argv[cur] == "--") cur = argv.length;
|
19
|
+
break;
|
20
|
+
} else ++cur;
|
21
|
+
}
|
22
|
+
if (cur == argv.length) break;
|
23
|
+
let a = argv[cur];
|
24
|
+
if (a[0] == "-" && a[1] == "-") { // a long option
|
25
|
+
pos = -1;
|
26
|
+
let c = 0, k = -1, tmp = "", o;
|
27
|
+
const pos_eq = a.indexOf("=");
|
28
|
+
if (pos_eq > 0) {
|
29
|
+
o = a.substring(2, pos_eq);
|
30
|
+
arg = a.substring(pos_eq + 1);
|
31
|
+
} else o = a.substring(2);
|
32
|
+
for (let i = 0; i < longopts.length; ++i) {
|
33
|
+
let y = longopts[i];
|
34
|
+
if (y[y.length - 1] == "=") y = y.substring(0, y.length - 1);
|
35
|
+
if (o.length <= y.length && o == y.substring(0, o.length)) {
|
36
|
+
k = i, tmp = y;
|
37
|
+
++c; // c is the number of matches
|
38
|
+
if (o == y) { // exact match
|
39
|
+
c = 1;
|
40
|
+
break;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
}
|
44
|
+
if (c == 1) { // find a unique match
|
45
|
+
lopt = tmp;
|
46
|
+
if (pos_eq < 0 && longopts[k][longopts[k].length-1] == "=" && cur + 1 < argv.length) {
|
47
|
+
arg = argv[cur+1];
|
48
|
+
argv.delete_at(cur + 1);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
} else { // a short option
|
52
|
+
if (pos == 0) pos = 1;
|
53
|
+
opt = a[pos++];
|
54
|
+
let k = ostr.indexOf(opt);
|
55
|
+
if (k < 0) {
|
56
|
+
opt = "?";
|
57
|
+
} else if (k + 1 < ostr.length && ostr[k+1] == ":") { // requiring an argument
|
58
|
+
if (pos >= a.length) {
|
59
|
+
arg = argv[cur+1];
|
60
|
+
argv.delete_at(cur + 1);
|
61
|
+
} else arg = a.substring(pos);
|
62
|
+
pos = -1;
|
63
|
+
}
|
64
|
+
}
|
65
|
+
if (pos < 0 || pos >= argv[cur].length) {
|
66
|
+
argv.delete_at(cur);
|
67
|
+
pos = 0;
|
68
|
+
}
|
69
|
+
if (lopt != "") yield { opt: `--${lopt}`, arg: arg };
|
70
|
+
else if (opt != "?") yield { opt: `-${opt}`, arg: arg };
|
71
|
+
else yield { opt: "?", arg: "" };
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
function* k8_readline(fn) {
|
76
|
+
let buf = new Bytes();
|
77
|
+
let file = new File(fn);
|
78
|
+
while (file.readline(buf) >= 0) {
|
79
|
+
yield buf.toString();
|
80
|
+
}
|
81
|
+
file.close();
|
82
|
+
buf.destroy();
|
83
|
+
}
|
84
|
+
|
85
|
+
function merge_hits(b) {
|
86
|
+
if (b.length == 1)
|
87
|
+
return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:b[0].min_cov, max_cov:b[0].max_cov, cov1:b[0].cov1, cov2:b[0].cov2, s1:b[0].s1, dv:b[0].dv };
|
88
|
+
b.sort(function(x, y) { return x.st1 - y.st1 });
|
89
|
+
let f = [], bt = [];
|
90
|
+
for (let i = 0; i < b.length; ++i)
|
91
|
+
f[i] = b[i].s1, bt[i] = -1;
|
92
|
+
for (let i = 0; i < b.length; ++i) {
|
93
|
+
for (let j = 0; j < i; ++j) {
|
94
|
+
if (b[j].st2 < b[i].st2) {
|
95
|
+
if (b[j].en1 >= b[i].en1) continue;
|
96
|
+
if (b[j].en2 >= b[i].en2) continue;
|
97
|
+
const ov1 = b[j].en1 <= b[i].st1? 0 : b[i].st1 - b[j].en1;
|
98
|
+
const li1 = b[i].en1 - b[i].st1;
|
99
|
+
const s11 = b[i].s1 / li1 * (li1 - ov1);
|
100
|
+
const ov2 = b[j].en2 <= b[i].st2? 0 : b[i].st2 - b[j].en2;
|
101
|
+
const li2 = b[i].en2 - b[i].st2;
|
102
|
+
const s12 = b[i].s1 / li2 * (li2 - ov2);
|
103
|
+
const s1 = s11 < s12? s11 : s12;
|
104
|
+
if (f[i] < f[j] + s1)
|
105
|
+
f[i] = f[j] + s1, bt[i] = j;
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
let max_i = -1, max_f = 0, d = [];
|
110
|
+
for (let i = 0; i < b.length; ++i)
|
111
|
+
if (max_f < f[i])
|
112
|
+
max_f = f[i], max_i = i;
|
113
|
+
for (let k = max_i; k >= 0; k = bt[k])
|
114
|
+
d.push(k);
|
115
|
+
d = d.reverse();
|
116
|
+
let dv = 0, tot = 0, cov1 = 0, cov2 = 0, st1 = 0, en1 = 0, st2 = 0, en2 = 0;
|
117
|
+
for (let k = 0; k < d.length; ++k) {
|
118
|
+
const i = d[k];
|
119
|
+
tot += b[i].blen;
|
120
|
+
dv += b[i].dv * b[i].blen;
|
121
|
+
if (b[i].st1 > en1) {
|
122
|
+
cov1 += en1 - st1;
|
123
|
+
st1 = b[i].st1, en1 = b[i].en1;
|
124
|
+
} else en1 = en1 > b[i].en1? en1 : b[i].en1;
|
125
|
+
if (b[i].st2 > en2) {
|
126
|
+
cov2 += en2 - st2;
|
127
|
+
st2 = b[i].st2, en2 = b[i].en2;
|
128
|
+
} else en2 = en2 > b[i].en2? en2 : b[i].en2;
|
129
|
+
}
|
130
|
+
dv /= tot;
|
131
|
+
cov1 = (cov1 + (en1 - st1)) / b[0].len1;
|
132
|
+
cov2 = (cov2 + (en2 - st2)) / b[0].len2;
|
133
|
+
const min_cov = cov1 < cov2? cov1 : cov2;
|
134
|
+
const max_cov = cov1 > cov2? cov1 : cov2;
|
135
|
+
//warn(d.length, b[0].name1, b[0].name2, min_cov, max_cov);
|
136
|
+
return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:min_cov, max_cov:max_cov, cov1:cov1, cov2:cov2, s1:max_f, dv:dv };
|
137
|
+
}
|
138
|
+
|
139
|
+
function main(args) {
|
140
|
+
let opt = { min_cov:.9, max_dv:.015, max_diff:20000 };
|
141
|
+
for (const o of getopt(args, "c:d:e:", [])) {
|
142
|
+
if (o.opt == '-c') opt.min_cov = parseFloat(o.arg);
|
143
|
+
else if (o.opt == '-d') opt.max_dv = parseFloat(o.arg);
|
144
|
+
else if (o.opt == '-e') opt.max_diff = parseFloat(o.arg);
|
145
|
+
}
|
146
|
+
if (args.length == 0) {
|
147
|
+
print("Usage: pafcluster.js [options] <ava.paf>");
|
148
|
+
print("Options:");
|
149
|
+
print(` -c FLOAT min coverage [${opt.min_cov}]`);
|
150
|
+
print(` -d FLOAT max divergence [${opt.max_dv}]`);
|
151
|
+
print(` -e FLOAT max difference [${opt.max_diff}]`);
|
152
|
+
return;
|
153
|
+
}
|
154
|
+
|
155
|
+
// read
|
156
|
+
let a = [], len = {}, name2len = {};
|
157
|
+
for (const line of k8_readline(args[0])) {
|
158
|
+
let m, t = line.split("\t");
|
159
|
+
if (t[4] != "+") continue;
|
160
|
+
for (let i = 1; i < 4; ++i) t[i] = parseInt(t[i]);
|
161
|
+
for (let i = 6; i < 11; ++i) t[i] = parseInt(t[i]);
|
162
|
+
const len1 = t[1], len2 = t[6];
|
163
|
+
let s1 = -1, dv = -1.0;
|
164
|
+
for (let i = 12; i < t.length; ++i) {
|
165
|
+
if ((m = /^(s1|dv):\S:(\S+)/.exec(t[i])) != null) {
|
166
|
+
if (m[1] == "s1") s1 = parseInt(m[2]);
|
167
|
+
else if (m[1] == "dv") dv = parseFloat(m[2]);
|
168
|
+
}
|
169
|
+
}
|
170
|
+
if (s1 < 0 || dv < 0) continue;
|
171
|
+
const cov1 = (parseInt(t[3]) - parseInt(t[2])) / len1;
|
172
|
+
const cov2 = (parseInt(t[8]) - parseInt(t[7])) / len2;
|
173
|
+
const min_cov = cov1 < cov2? cov1 : cov2;
|
174
|
+
const max_cov = cov1 > cov2? cov1 : cov2;
|
175
|
+
name2len[t[0]] = len1;
|
176
|
+
name2len[t[5]] = len2;
|
177
|
+
a.push({ name1:t[0], name2:t[5], len1:len1, len2:len2, min_cov:min_cov, max_cov:max_cov, s1:s1, dv:dv, cov1:cov1, cov2:cov2, st1:t[2], en1:t[3], st2:t[7], en2:t[8], blen:t[10] });
|
178
|
+
len[t[0]] = len1, len[t[5]] = len2;
|
179
|
+
}
|
180
|
+
warn(`Read ${a.length} hits`);
|
181
|
+
|
182
|
+
// merge duplicated hits
|
183
|
+
let h = {};
|
184
|
+
for (let i = 0; i < a.length; ++i) {
|
185
|
+
const key = `${a[i].name1}\t${a[i].name2}`;
|
186
|
+
if (h[key] == null) h[key] = [];
|
187
|
+
h[key].push(a[i]);
|
188
|
+
}
|
189
|
+
a = [];
|
190
|
+
for (const key in h)
|
191
|
+
a.push(merge_hits(h[key]));
|
192
|
+
|
193
|
+
// core loop
|
194
|
+
while (a.length > 1) {
|
195
|
+
// select the sequence with the highest sum of s1
|
196
|
+
let h = {};
|
197
|
+
for (let i = 0; i < a.length; ++i) {
|
198
|
+
if (h[a[i].name1] == null) h[a[i].name1] = 0;
|
199
|
+
h[a[i].name1] += a[i].s1;
|
200
|
+
}
|
201
|
+
let max_s1 = 0, max_name = "";
|
202
|
+
for (const name in h)
|
203
|
+
if (max_s1 < h[name])
|
204
|
+
max_s1 = h[name], max_name = name;
|
205
|
+
// find contigs in the same group
|
206
|
+
h = {};
|
207
|
+
h[max_name] = 1;
|
208
|
+
for (let i = 0; i < a.length; ++i) {
|
209
|
+
if (a[i].name1 != max_name && a[i].name2 != max_name)
|
210
|
+
continue;
|
211
|
+
const diff1 = a[i].len1 * (1.0 - a[i].cov1);
|
212
|
+
const diff2 = a[i].len2 * (1.0 - a[i].cov2);
|
213
|
+
if (a[i].min_cov >= opt.min_cov && a[i].dv <= opt.max_dv && diff1 <= opt.max_diff && diff2 <= opt.max_diff)
|
214
|
+
h[a[i].name1] = h[a[i].name2] = 1;
|
215
|
+
}
|
216
|
+
let n = 0;
|
217
|
+
for (const key in h) {
|
218
|
+
++n;
|
219
|
+
delete name2len[key];
|
220
|
+
}
|
221
|
+
print(`SD\t${max_name}\t${n}`);
|
222
|
+
for (const key in h) print(`CL\t${key}\t${len[key]}`);
|
223
|
+
print("//");
|
224
|
+
// filter out redundant hits
|
225
|
+
let b = [];
|
226
|
+
for (let i = 0; i < a.length; ++i)
|
227
|
+
if (h[a[i].name1] == null && h[a[i].name2] == null)
|
228
|
+
b.push(a[i]);
|
229
|
+
warn(`Reduced the number of hits from ${a.length} to ${b.length}`);
|
230
|
+
a = b;
|
231
|
+
}
|
232
|
+
|
233
|
+
// output remaining singletons
|
234
|
+
for (const key in name2len) {
|
235
|
+
print(`SD\t${key}\t1`);
|
236
|
+
print(`CL\t${key}\t${name2len[key]}`);
|
237
|
+
print(`//`);
|
238
|
+
}
|
239
|
+
}
|
240
|
+
|
241
|
+
main(arguments);
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env k8
|
2
2
|
|
3
|
-
var paftools_version = '2.
|
3
|
+
var paftools_version = '2.29-r1283';
|
4
4
|
|
5
5
|
/*****************************
|
6
6
|
***** Library functions *****
|
@@ -1740,15 +1740,17 @@ function paf_gff2bed(args)
|
|
1740
1740
|
|
1741
1741
|
function paf_sam2paf(args)
|
1742
1742
|
{
|
1743
|
-
var c, pri_only = false, long_cs = false;
|
1744
|
-
while ((c = getopt(args, "
|
1743
|
+
var c, pri_only = false, long_cs = false, pri_pri_only = false;
|
1744
|
+
while ((c = getopt(args, "pPL")) != null) {
|
1745
1745
|
if (c == 'p') pri_only = true;
|
1746
|
+
else if (c == 'P') pri_pri_only = pri_only = true;
|
1746
1747
|
else if (c == 'L') long_cs = true;
|
1747
1748
|
}
|
1748
1749
|
if (args.length == getopt.ind) {
|
1749
1750
|
print("Usage: paftools.js sam2paf [options] <in.sam>");
|
1750
1751
|
print("Options:");
|
1751
1752
|
print(" -p convert primary or supplementary alignments only");
|
1753
|
+
print(" -P convert primary alignments only");
|
1752
1754
|
print(" -L output the cs tag in the long form");
|
1753
1755
|
exit(1);
|
1754
1756
|
}
|
@@ -1775,6 +1777,7 @@ function paf_sam2paf(args)
|
|
1775
1777
|
throw Error("at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
|
1776
1778
|
if (t[2] == '*' || (flag&4) || t[5] == '*') continue;
|
1777
1779
|
if (pri_only && (flag&0x100)) continue;
|
1780
|
+
if (pri_pri_only && (flag&0x900)) continue;
|
1778
1781
|
var tlen = ctg_len[t[2]];
|
1779
1782
|
if (tlen == null) throw Error("at line " + lineno + ": can't find the length of contig " + t[2]);
|
1780
1783
|
// find tags
|
@@ -1887,7 +1890,10 @@ function paf_sam2paf(args)
|
|
1887
1890
|
// optional tags
|
1888
1891
|
var type = flag&0x100? 'S' : 'P';
|
1889
1892
|
var tags = ["tp:A:" + type];
|
1890
|
-
if (NM != null)
|
1893
|
+
if (NM != null) {
|
1894
|
+
tags.push("NM:i:"+NM);
|
1895
|
+
tags.push("mm:i:"+mm);
|
1896
|
+
}
|
1891
1897
|
tags.push("gn:i:"+(I[1]+D[1]), "go:i:"+(I[0]+D[0]), "cg:Z:" + t[5].replace(/\d+[SH]/g, ''));
|
1892
1898
|
if (cs_str != null) tags.push("cs:Z:" + cs_str);
|
1893
1899
|
else if (cs.length > 0) tags.push("cs:Z:" + cs.join(""));
|
@@ -2181,7 +2187,7 @@ function paf_mapeval(args)
|
|
2181
2187
|
}
|
2182
2188
|
|
2183
2189
|
var lineno = 0, last = null, a = [], n_unmapped = null;
|
2184
|
-
var re_cigar = /(\d+)([MIDSHN])/g;
|
2190
|
+
var re_cigar = /(\d+)([MIDSHN=X])/g;
|
2185
2191
|
while (file.readline(buf) >= 0) {
|
2186
2192
|
var m, line = buf.toString();
|
2187
2193
|
++lineno;
|
@@ -2219,7 +2225,7 @@ function paf_mapeval(args)
|
|
2219
2225
|
var n_gap = 0, mlen = 0;
|
2220
2226
|
while ((m = re_cigar.exec(t[5])) != null) {
|
2221
2227
|
var len = parseInt(m[1]);
|
2222
|
-
if (m[2] == 'M') pos_end += len, mlen += len;
|
2228
|
+
if (m[2] == 'M' || m[2] == 'X' || m[2] == '=') pos_end += len, mlen += len;
|
2223
2229
|
else if (m[2] == 'I') n_gap += len;
|
2224
2230
|
else if (m[2] == 'D') n_gap += len, pos_end += len;
|
2225
2231
|
}
|
@@ -2488,6 +2494,10 @@ function paf_junceval(args)
|
|
2488
2494
|
} else { // SAM
|
2489
2495
|
ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
|
2490
2496
|
var flag = parseInt(t[1]);
|
2497
|
+
if (flag & 1) {
|
2498
|
+
if (flag & 0x40) qname += '/1';
|
2499
|
+
else if (flag & 0x80) qname += '/2';
|
2500
|
+
}
|
2491
2501
|
if (flag&0x100) continue; // secondary
|
2492
2502
|
}
|
2493
2503
|
|
@@ -3234,6 +3244,7 @@ function paf_sveval(args)
|
|
3234
3244
|
if (bed != null && bed[t[0]] == null) continue;
|
3235
3245
|
if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
|
3236
3246
|
if (/[\[\]]/.test(t[4])) continue; // no break points
|
3247
|
+
if (t[6] != "." && t[6] != "PASS") continue;
|
3237
3248
|
var st = parseInt(t[1]) - 1, en = st + t[3].length;
|
3238
3249
|
// parse svlen
|
3239
3250
|
var b = _paf_get_alen(t), svlen = b[0];
|