RubyGems - minimap2 - Versions diffs - 0.2.28.0 → 0.2.29.0 - Mend

minimap2 0.2.28.0 → 0.2.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +4 -4
data/README.md +1 -0
data/ext/cmappy/cmappy.c +3 -3
data/ext/cmappy/cmappy.h +1 -1
data/ext/minimap2/FAQ.md +1 -1
data/ext/minimap2/Makefile +4 -3
data/ext/minimap2/NEWS.md +39 -0
data/ext/minimap2/README.md +30 -14
data/ext/minimap2/align.c +134 -50
data/ext/minimap2/cookbook.md +2 -2
data/ext/minimap2/format.c +57 -3
data/ext/minimap2/hit.c +14 -6
data/ext/minimap2/index.c +304 -13
data/ext/minimap2/jump.c +201 -0
data/ext/minimap2/kalloc.h +8 -0
data/ext/minimap2/ksw2.h +5 -2
data/ext/minimap2/ksw2_dispatch.c +5 -5
data/ext/minimap2/ksw2_exts2_sse.c +17 -6
data/ext/minimap2/main.c +60 -12
data/ext/minimap2/map.c +35 -8
data/ext/minimap2/minimap.h +14 -3
data/ext/minimap2/minimap2.1 +92 -45
data/ext/minimap2/misc/README.md +2 -1
data/ext/minimap2/misc/pafcluster.js +241 -0
data/ext/minimap2/misc/paftools.js +8 -3
data/ext/minimap2/mmpriv.h +24 -2
data/ext/minimap2/options.c +27 -2
data/ext/minimap2/python/cmappy.h +3 -3
data/ext/minimap2/python/cmappy.pxd +4 -2
data/ext/minimap2/python/mappy.pyx +19 -7
data/ext/minimap2/setup.py +2 -2
data/ext/minimap2.patch +2 -2
data/lib/minimap2/aligner.rb +19 -12
data/lib/minimap2/ffi/constants.rb +9 -1
data/lib/minimap2/ffi/functions.rb +145 -6
data/lib/minimap2/ffi/mappy.rb +1 -1
data/lib/minimap2/version.rb +1 -1
data/lib/minimap2.rb +2 -2
metadata +5 -4
data/ext/minimap2/misc/mmphase.js +0 -335

data/ext/minimap2/minimap.h CHANGED Viewed

@@ -5,7 +5,7 @@
 #include <stdio.h>
 #include <sys/types.h>
-#define MM_VERSION "2.28-r1209"
+#define MM_VERSION "2.29-r1283"
 #define MM_F_NO_DIAG       (0x001LL) // no exact diagonal hit
 #define MM_F_NO_DUAL       (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -45,6 +45,9 @@
 #define MM_F_SPLICE_OLD    (0x800000000LL)
 #define MM_F_SECONDARY_SEQ (0x1000000000LL)	//output SEQ field for secondary alignments using hard clipping
 #define MM_F_OUT_DS        (0x2000000000LL)
+#define MM_F_WEAK_PAIRING  (0x4000000000LL) // presumably set by --pairing=weak; TODO confirm against the option parser
+#define MM_F_SR_RNA        (0x8000000000LL) // presumably set by --sr=rna; TODO confirm
+#define MM_F_OUT_JUNC      (0x10000000000LL) // presumably set by --write-junc; TODO confirm
 #define MM_I_HPC          0x1
 #define MM_I_NO_SEQ       0x2
@@ -91,6 +94,8 @@ typedef struct {
 	uint32_t *S;               // 4-bit packed sequence
 	struct mm_idx_bucket_s *B; // index (hidden)
 	struct mm_idx_intv_s *I;   // intervals (hidden)
+	struct mm_idx_spsc_s *spsc;// splice score (hidden)
+	struct mm_idx_jjump_s *J;  // junctions to create jumps (hidden)
 	void *km, *h;
 } mm_idx_t;
@@ -115,7 +120,7 @@ typedef struct {
 	int32_t mlen, blen;     // seeded exact match length; seeded alignment block length
 	int32_t n_sub;          // number of suboptimal mappings
 	int32_t score0;         // initial chaining score (before chain merging/spliting)
-	uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
+	uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, is_spliced:1, dummy:4;
 	uint32_t hash;
 	float div;
 	mm_extra_t *p;
@@ -158,7 +163,7 @@ typedef struct {
 	int transition; // transition mismatch score (A:G, C:T)
 	int sc_ambi; // score when one or both bases are "N"
 	int noncan;      // cost of non-canonical splicing sites
-	int junc_bonus;
+	int junc_bonus, junc_pen;
 	int zdrop, zdrop_inv;   // break alignment if alignment score drops too fast along the diagonal
 	int end_bonus;
 	int min_dp_max;  // drop an alignment if the score of the max scoring segment is below this threshold
@@ -171,6 +176,8 @@ typedef struct {
 	int pe_ori, pe_bonus;
+	int32_t jump_min_match;
 	float mid_occ_frac;  // only used by mm_mapopt_update(); see below
 	float q_occ_frac;
 	int32_t min_mid_occ, max_mid_occ;
@@ -411,6 +418,10 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
 int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
 int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
+int mm_max_spsc_bonus(const mm_mapopt_t *mo);
+int32_t mm_idx_spsc_read(mm_idx_t *idx, const char *fn, int32_t max_sc);
+int64_t mm_idx_spsc_get(const mm_idx_t *db, int32_t cid, int64_t st0, int64_t en0, int32_t rev, uint8_t *sc);
 // deprecated APIs for backward compatibility
 void mm_mapopt_init(mm_mapopt_t *opt);
 mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);

data/ext/minimap2/minimap2.1 CHANGED Viewed

@@ -1,4 +1,4 @@
-.TH minimap2 1 "12 March 2024" "minimap2-2.28 (r1209)" "Bioinformatics tools"
+.TH minimap2 1 "18 April 2025" "minimap2-2.29 (r1283)" "Bioinformatics tools"
 .SH NAME
 .PP
 minimap2 - mapping and alignment between collections of DNA sequences
@@ -79,19 +79,6 @@ Minimizer k-mer length [15]
 .BI -w \ INT
 Minimizer window size [10]. A minimizer is the smallest k-mer
 in a window of w consecutive k-mers.
-.TP
-.BI -j \ INT
-Syncmer submer size [10]. Option
-.B -j
-and
-.B -w
-will override each: if
-.B -w
-is applied after
-.BR -j ,
-.B -j
-will have no effect, and vice versa.
 .TP
 .B -H
 Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
@@ -310,11 +297,13 @@ maximum alignment gap is mostly controlled by
 .B --splice
 Enable the splice alignment mode.
 .TP
-.B --sr
-Enable short-read alignment heuristics. In the short-read mode, minimap2
-applies a second round of chaining with a higher minimizer occurrence threshold
-if no good chain is found. In addition, minimap2 attempts to patch gaps between
-seeds with ungapped alignment.
+.BR --sr [= no | dna | rna ]
+Enable short-read alignment heuristics [no]. If this option is used with no argument,
+.RB ` dna '
+is set. In the DNA short-read mode, minimap2 applies a second round of chaining
+with a higher minimizer occurrence threshold if no good chain is found. In
+addition, minimap2 attempts to patch gaps between seeds with ungapped
+alignment.
 .TP
 .BI --split-prefix \ STR
 Prefix to create temporary files. Typically used for a multi-part index.
@@ -334,10 +323,6 @@ Only map to the reverse complement strand of the reference sequences.
 If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
 faster for short reads, but slower for long reads. [no]
 .TP
-.B --no-pairing
-Treat two reads in a pair as independent reads. The mate related fields in SAM
-are still properly populated.
-.TP
 .B --no-hash-name
 Produce the same alignment for identical sequences regardless of their sequence names.
 .SS Alignment options
@@ -371,7 +356,16 @@ Splice model [1]. 0 for the original minimap2 splice model that always penalizes
 .B -C
 has no effect with the default
 .BR -J1 .
-.BR -J0 .
+.TP
+.BR -j \ FILE
+Junctions used to extend alignment towards ends of reads [].
+.I FILE
+can be gene annotations in the BED12 format (aka 12-column BED), or intron
+positions in 5-column BED with the strand column required. BED12 file can be
+converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'. This option is
+intended for short RNA-seq reads, while
+.B --junc-bed
+for long noisy RNA-seq reads.
 .TP
 .BI -C \ INT
 Cost for a non-canonical GT-AG splicing (effective with
@@ -414,7 +408,16 @@ no attempt to match GT-AG [n]
 Score bonus when alignment extends to the end of the query sequence [0].
 .TP
 .BI --score-N \ INT
-Score of a mismatch involving ambiguous bases [1].
+Penalty of a mismatch involving ambiguous bases [1].
+.TP
+.BR --pairing = strong | weak | no
+How to pair paired-end reads [strong].
+.RB ` no '
+for aligning the two ends in a pair independently with no `properly paired' set.
+.RB ` weak '
+for aligning the two ends independently and then pairing the hits.
+.RB ` strong '
+for jointly aligning and pairing the two ends.
 .TP
 .BR --splice-flank = yes | no
 Assume the next base to a
@@ -433,16 +436,40 @@ on SIRV data, please add
 .B --splice-flank=no
 to the command line.
 .TP
+.BR --spsc \ FILE
+Splice scores []. Each line consists of five fields: 1) contig, 2) offset, 3) `+' or `-', 4) `D' or `A', and 5) score,
+where offset is the number of bases before a splice junction, `D' indicates the
+line corresponds to a donor site and `A' for an acceptor site.
+A positive score suggests the junction is preferred and a negative score
+suggests the junction is not preferred.
+.TP
+.BR --junc-pen \ INT
+Penalty for a position not in FILE specified by
+.B --spsc
+[5]. Effective with
+.B --spsc
+but not
+.BR --junc-bed .
+.TP
 .BR --junc-bed \ FILE
-Gene annotations in the BED12 format (aka 12-column BED), or intron positions
-in 5-column BED. With this option, minimap2 prefers splicing in annotations.
-BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
-[].
+Junctions to prefer during base alignment [].
+Same format as
+.BR -j .
+It is
+.I NOT
+recommended to apply this option to short RNA-seq reads. This would increase
+run time with little improvement to junction accuracy.
 .TP
 .BR --junc-bonus \ INT
-Score bonus for a splice donor or acceptor found in annotation (effective with
-.BR --junc-bed )
-[9].
+Score bonus for a splice donor or acceptor found in annotation [9]. Effective with
+.B --junc-bed
+but not
+.BR --spsc .
+.TP
+.BR --jump-min-match \ INT
+Minimum matching length to create a jump [3]. Equivalent to
+.B STAR
+.BR --alignSJDBoverhangMin .
 .TP
 .BI --end-seed-pen \ INT
 Drop a terminal anchor if
@@ -500,20 +527,13 @@ Copy input FASTA/Q comments to output.
 .B -c
 Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
 .TP
-.BI --cs[= STR ]
+.BR --cs [= short | long ]
 Output the
 .B cs
 tag.
-.I STR
-can be either
-.I short
-or
-.IR long .
-If no
-.I STR
-is given,
-.I short
-is assumed. [none]
+If no argument is given,
+.RB ` short '
+is set. [none]
 .TP
 .B --MD
 Output the MD tag (see the SAM spec).
@@ -527,6 +547,26 @@ In SAM output, use soft clipping for supplementary alignments.
 .B --secondary-seq
 In SAM output, show query sequences for secondary alignments.
 .TP
+.B --write-junc
+Output splice junctions in 6-column BED: contig name, start, end,
+read name, score and strand. Score is the sum of donor and acceptor scores,
+where GT gets 3, GC gets 2 and AT gets 1 at donor sites,
+while AG gets 3 and AC gets 1 at acceptor sites.
+Alignments with mapping quality below 10 are ignored.
+.TP
+.BI --pass1 \ FILE
+Junctions BED file outputted by
+.B --write-junc
+[]. Rows with scores lower than 5 are ignored. When both
+.B -j
+and
+.B --pass1
+are present, junctions in
+.B -j
+are preferred over those in
+.BR --pass1
+when there is ambiguity.
+.TP
 .BI --seed \ INT
 Integer seed for randomizing equally best hits. Minimap2 hashes
 .I INT
@@ -666,10 +706,16 @@ Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
 .B -C5 -O6,24
 .BR -B4 ).
 .TP
+.B splice:sr
+Spliced alignment for short RNA-seq reads
+.RB ( -xsplice:hq
+.B --frag=yes -m25 -s40 -2K100m --heap-sort=yes --pairing=weak --sr=rna --min-dp-len=20
+.BR --secondary=no ).
+.TP
 .B sr
 Short-read alignment without splicing
 .RB ( -k21
-.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
+.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
 .B -s40 -g100 -2K50m --heap-sort=yes
 .BR --secondary=no ).
 .TP
@@ -742,7 +788,7 @@ s2	i	Chaining score of the best secondary chain
 NM	i	Total number of mismatches and gaps in the alignment
 MD	Z	To generate the ref sequence in the alignment
 AS	i	DP alignment score
-SA	Z	List of other supplementary alignments
+SA	Z	List of other supplementary alignments (with approximate CIGAR strings)
 ms	i	DP score of the max scoring segment in the alignment
 nn	i	Number of ambiguous bases in the alignment
 ts	A	Transcript strand (splice mode only)
@@ -751,6 +797,7 @@ cs	Z	Difference string
 dv	f	Approximate per-base sequence divergence
 de	f	Gap-compressed per-base sequence divergence
 rl	i	Length of query regions harboring repetitive seeds
+zd	i	Alignment broken due to Z-drop; bit 1: left broken; bit 2: right broken
 .TE
 .PP

data/ext/minimap2/misc/README.md CHANGED Viewed

@@ -16,7 +16,8 @@ minimap2 -c test/MT-human.fa test/MT-orang.fa \
   | paftools.js liftover -l10000 - <(echo -e "MT_orang\t2000\t5000")     # liftOver
 # no test data for the following examples
 paftools.js junceval -e anno.gtf splice.sam > out.txt  # compare splice junctions to annotations
-paftools.js splice2bed anno.gtf > anno.bed             # convert GTF/GFF3 to BED12
+paftools.js splice2bed splice.sam > splice.bed             # convert PAF/SAM to BED12
+paftools.js gff2bed anno.gtf > anno.bed             # convert GTF/GFF3 to BED12
 ```
 ## Table of Contents

data/ext/minimap2/misc/pafcluster.js ADDED Viewed

@@ -0,0 +1,241 @@
+#!/usr/bin/env k8
+"use strict";
+Array.prototype.delete_at = function(i) { // remove the element at index i in place by shifting later elements left
+	for (let j = i; j < this.length - 1; ++j)
+		this[j] = this[j + 1];
+	--this.length; // drop the last slot; note this also runs when i is at or past the end
+}
+function* getopt(argv, ostr, longopts) { // generator: yields one { opt, arg } object per option found in argv; consumed elements are deleted from argv
+	if (argv.length == 0) return;
+	let pos = 0, cur = 0; // pos: position inside a short-option element; cur: index into argv
+	while (cur < argv.length) {
+		let lopt = "", opt = "?", arg = "";
+		while (cur < argv.length) { // skip non-option arguments
+			if (argv[cur][0] == "-" && argv[cur].length > 1) {
+				if (argv[cur] == "--") cur = argv.length; // "--" stops option parsing; the "--" element itself is left in argv
+				break;
+			} else ++cur;
+		}
+		if (cur == argv.length) break;
+		let a = argv[cur];
+		if (a[0] == "-" && a[1] == "-") { // a long option
+			pos = -1; // negative pos makes the check below delete this argv element
+			let c = 0, k = -1, tmp = "", o;
+			const pos_eq = a.indexOf("=");
+			if (pos_eq > 0) { // "--name=value" form
+				o = a.substring(2, pos_eq);
+				arg = a.substring(pos_eq + 1);
+			} else o = a.substring(2);
+			for (let i = 0; i < longopts.length; ++i) {
+				let y = longopts[i];
+				if (y[y.length - 1] == "=") y = y.substring(0, y.length - 1); // a trailing "=" marks a long option that takes an argument
+				if (o.length <= y.length && o == y.substring(0, o.length)) { // o equals y or is a prefix of it
+					k = i, tmp = y;
+					++c; // c is the number of matches
+					if (o == y) { // exact match
+						c = 1;
+						break;
+					}
+				}
+			}
+			if (c == 1) { // find a unique match
+				lopt = tmp;
+				if (pos_eq < 0 && longopts[k][longopts[k].length-1] == "=" && cur + 1 < argv.length) { // argument given as the next argv element
+					arg = argv[cur+1];
+					argv.delete_at(cur + 1);
+				}
+			}
+		} else { // a short option
+			if (pos == 0) pos = 1; // start just after the leading "-"
+			opt = a[pos++];
+			let k = ostr.indexOf(opt);
+			if (k < 0) {
+				opt = "?";
+			} else if (k + 1 < ostr.length && ostr[k+1] == ":") { // requiring an argument
+				if (pos >= a.length) { // nothing attached, so take the next argv element
+					arg = argv[cur+1]; // NOTE(review): undefined when this option is the last element of argv (no bounds check, unlike the long-option path)
+					argv.delete_at(cur + 1); // NOTE(review): in that case delete_at still shortens argv by one; verify handling of a missing argument
+				} else arg = a.substring(pos); // argument attached to the option letter
+				pos = -1;
+			}
+		}
+		if (pos < 0 || pos >= argv[cur].length) { // this argv element is fully consumed
+			argv.delete_at(cur);
+			pos = 0;
+		}
+		if (lopt != "") yield { opt: `--${lopt}`, arg: arg };
+		else if (opt != "?") yield { opt: `-${opt}`, arg: arg };
+		else yield { opt: "?", arg: "" }; // unknown, ambiguous or unmatched option
+	}
+}
+function* k8_readline(fn) { // generator: yields each line read from fn as a string
+	let buf = new Bytes(); // Bytes and File are not defined in this script; presumably provided by the k8 runtime named in the shebang
+	let file = new File(fn);
+	while (file.readline(buf) >= 0) { // a negative return value ends the loop
+		yield buf.toString();
+	}
+	file.close(); // NOTE(review): no try/finally, so these two cleanup calls are skipped if the consumer stops iterating early
+	buf.destroy();
+}
+function merge_hits(b) { // collapse the hits in b (all sharing one name1/name2 pair per the caller) into a single summary record
+	if (b.length == 1) // a single hit needs no chaining: return a copy of its summary fields
+		return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:b[0].min_cov, max_cov:b[0].max_cov, cov1:b[0].cov1, cov2:b[0].cov2, s1:b[0].s1, dv:b[0].dv };
+	b.sort(function(x, y) { return x.st1 - y.st1 }); // order hits by start on sequence 1 (sorts the caller's array in place)
+	let f = [], bt = []; // f[i]: best chain score ending at hit i; bt[i]: index of its predecessor, or -1
+	for (let i = 0; i < b.length; ++i)
+		f[i] = b[i].s1, bt[i] = -1;
+	for (let i = 0; i < b.length; ++i) {
+		for (let j = 0; j < i; ++j) {
+			if (b[j].st2 < b[i].st2) { // j may precede i only if it also starts earlier on sequence 2
+				if (b[j].en1 >= b[i].en1) continue; // ...and ends before i on sequence 1
+				if (b[j].en2 >= b[i].en2) continue; // ...and on sequence 2
+				const ov1 = b[j].en1 <= b[i].st1? 0 : b[i].st1 - b[j].en1; // NOTE(review): negative when j and i overlap, so (li1 - ov1) > li1 and s1 is scaled up; verify whether b[j].en1 - b[i].st1 was intended
+				const li1 = b[i].en1 - b[i].st1;
+				const s11 = b[i].s1 / li1 * (li1 - ov1);
+				const ov2 = b[j].en2 <= b[i].st2? 0 : b[i].st2 - b[j].en2; // same form (and same sign question) on sequence 2
+				const li2 = b[i].en2 - b[i].st2;
+				const s12 = b[i].s1 / li2 * (li2 - ov2);
+				const s1 = s11 < s12? s11 : s12; // use the smaller of the two scaled scores
+				if (f[i] < f[j] + s1)
+					f[i] = f[j] + s1, bt[i] = j;
+			}
+		}
+	}
+	let max_i = -1, max_f = 0, d = []; // d collects the indices of the best-scoring chain
+	for (let i = 0; i < b.length; ++i)
+		if (max_f < f[i])
+			max_f = f[i], max_i = i;
+	for (let k = max_i; k >= 0; k = bt[k]) // backtrack from the best chain end
+		d.push(k);
+	d = d.reverse(); // restore left-to-right order
+	let dv = 0, tot = 0, cov1 = 0, cov2 = 0, st1 = 0, en1 = 0, st2 = 0, en2 = 0;
+	for (let k = 0; k < d.length; ++k) {
+		const i = d[k];
+		tot += b[i].blen;
+		dv += b[i].dv * b[i].blen; // blen-weighted sum of dv
+		if (b[i].st1 > en1) { // starts past the current merged interval: bank it and open a new one
+			cov1 += en1 - st1;
+			st1 = b[i].st1, en1 = b[i].en1;
+		} else en1 = en1 > b[i].en1? en1 : b[i].en1; // otherwise extend the current interval
+		if (b[i].st2 > en2) {
+			cov2 += en2 - st2;
+			st2 = b[i].st2, en2 = b[i].en2;
+		} else en2 = en2 > b[i].en2? en2 : b[i].en2;
+	}
+	dv /= tot; // weighted mean; NOTE(review): NaN if the chain is empty (no f[i] > 0, so tot == 0)
+	cov1 = (cov1 + (en1 - st1)) / b[0].len1; // add the last open interval, then divide by the sequence length
+	cov2 = (cov2 + (en2 - st2)) / b[0].len2;
+	const min_cov = cov1 < cov2? cov1 : cov2;
+	const max_cov = cov1 > cov2? cov1 : cov2;
+	//warn(d.length, b[0].name1, b[0].name2, min_cov, max_cov);
+	return { name1:b[0].name1, name2:b[0].name2, len1:b[0].len1, len2:b[0].len2, min_cov:min_cov, max_cov:max_cov, cov1:cov1, cov2:cov2, s1:max_f, dv:dv };
+}
+function main(args) { // read hits from the file named in args, merge hits per name pair, then repeatedly print clusters of sequences
+	let opt = { min_cov:.9, max_dv:.015, max_diff:20000 }; // defaults, overridable with -c, -d and -e
+	for (const o of getopt(args, "c:d:e:", [])) { // getopt also deletes the parsed options from args
+		if (o.opt == '-c') opt.min_cov = parseFloat(o.arg);
+		else if (o.opt == '-d') opt.max_dv = parseFloat(o.arg);
+		else if (o.opt == '-e') opt.max_diff = parseFloat(o.arg);
+	}
+	if (args.length == 0) { // nothing left after option parsing: print usage
+		print("Usage: pafcluster.js [options] <ava.paf>");
+		print("Options:");
+		print(`  -c FLOAT     min coverage [${opt.min_cov}]`);
+		print(`  -d FLOAT     max divergence [${opt.max_dv}]`);
+		print(`  -e FLOAT     max difference [${opt.max_diff}]`);
+		return;
+	}
+	// read
+	let a = [], len = {}, name2len = {}; // a: hits; len and name2len: sequence name -> length
+	for (const line of k8_readline(args[0])) {
+		let m, t = line.split("\t");
+		if (t[4] != "+") continue; // skip lines whose 5th column is not "+"
+		for (let i = 1; i < 4;  ++i) t[i] = parseInt(t[i]); // columns 2-4 (1-based) to integers
+		for (let i = 6; i < 11; ++i) t[i] = parseInt(t[i]); // columns 7-11 (1-based) to integers
+		const len1 = t[1], len2 = t[6];
+		let s1 = -1, dv = -1.0; // negative means "tag not seen"
+		for (let i = 12; i < t.length; ++i) {
+			if ((m = /^(s1|dv):\S:(\S+)/.exec(t[i])) != null) { // pick up the s1 and dv tags
+				if (m[1] == "s1") s1 = parseInt(m[2]);
+				else if (m[1] == "dv") dv = parseFloat(m[2]);
+			}
+		}
+		if (s1 < 0 || dv < 0) continue; // both tags are required
+		const cov1 = (parseInt(t[3]) - parseInt(t[2])) / len1; // t[2] and t[3] are already integers here, so parseInt is redundant
+		const cov2 = (parseInt(t[8]) - parseInt(t[7])) / len2;
+		const min_cov = cov1 < cov2? cov1 : cov2;
+		const max_cov = cov1 > cov2? cov1 : cov2;
+		name2len[t[0]] = len1;
+		name2len[t[5]] = len2;
+		a.push({ name1:t[0], name2:t[5], len1:len1, len2:len2, min_cov:min_cov, max_cov:max_cov, s1:s1, dv:dv, cov1:cov1, cov2:cov2, st1:t[2], en1:t[3], st2:t[7], en2:t[8], blen:t[10] });
+		len[t[0]] = len1, len[t[5]] = len2; // same values as name2len; name2len entries are deleted once clustered, len is kept for output
+	}
+	warn(`Read ${a.length} hits`);
+	// merge duplicated hits
+	let h = {}; // "name1\tname2" -> list of hits for that pair
+	for (let i = 0; i < a.length; ++i) {
+		const key = `${a[i].name1}\t${a[i].name2}`;
+		if (h[key] == null) h[key] = [];
+		h[key].push(a[i]);
+	}
+	a = [];
+	for (const key in h)
+		a.push(merge_hits(h[key]));
+	// core loop
+	while (a.length > 1) { // stops once at most one merged hit remains
+		// select the sequence with the highest sum of s1
+		let h = {}; // shadows the outer h; here name1 -> summed s1
+		for (let i = 0; i < a.length; ++i) {
+			if (h[a[i].name1] == null) h[a[i].name1] = 0;
+			h[a[i].name1] += a[i].s1;
+		}
+		let max_s1 = 0, max_name = ""; // NOTE(review): if every remaining s1 is 0, max_name stays "" and the filter below removes nothing, so this loop would not terminate; verify
+		for (const name in h)
+			if (max_s1 < h[name])
+				max_s1 = h[name], max_name = name;
+		// find contigs in the same group
+		h = {}; // reused as the set of names in this cluster
+		h[max_name] = 1;
+		for (let i = 0; i < a.length; ++i) {
+			if (a[i].name1 != max_name && a[i].name2 != max_name)
+				continue;
+			const diff1 = a[i].len1 * (1.0 - a[i].cov1); // uncovered length of sequence 1
+			const diff2 = a[i].len2 * (1.0 - a[i].cov2); // uncovered length of sequence 2
+			if (a[i].min_cov >= opt.min_cov && a[i].dv <= opt.max_dv && diff1 <= opt.max_diff && diff2 <= opt.max_diff)
+				h[a[i].name1] = h[a[i].name2] = 1;
+		}
+		let n = 0;
+		for (const key in h) {
+			++n;
+			delete name2len[key]; // clustered, so it is not reported as a singleton later
+		}
+		print(`SD\t${max_name}\t${n}`);
+		for (const key in h) print(`CL\t${key}\t${len[key]}`);
+		print("//");
+		// filter out redundant hits
+		let b = [];
+		for (let i = 0; i < a.length; ++i)
+			if (h[a[i].name1] == null && h[a[i].name2] == null) // keep hits that touch no sequence in this cluster
+				b.push(a[i]);
+		warn(`Reduced the number of hits from ${a.length} to ${b.length}`);
+		a = b;
+	}
+	// output remaining singletons
+	for (const key in name2len) {
+		print(`SD\t${key}\t1`);
+		print(`CL\t${key}\t${name2len[key]}`);
+		print(`//`);
+	}
+}
+main(arguments); // script entry point

data/ext/minimap2/misc/paftools.js CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env k8
-var paftools_version = '2.28-r1209';
+var paftools_version = '2.29-r1283';
 /*****************************
  ***** Library functions *****
@@ -2187,7 +2187,7 @@ function paf_mapeval(args)
 	}
 	var lineno = 0, last = null, a = [], n_unmapped = null;
-	var re_cigar = /(\d+)([MIDSHN])/g;
+	var re_cigar = /(\d+)([MIDSHN=X])/g;
 	while (file.readline(buf) >= 0) {
 		var m, line = buf.toString();
 		++lineno;
@@ -2225,7 +2225,7 @@ function paf_mapeval(args)
 				var n_gap = 0, mlen = 0;
 				while ((m = re_cigar.exec(t[5])) != null) {
 					var len = parseInt(m[1]);
-					if (m[2] == 'M') pos_end += len, mlen += len;
+					if (m[2] == 'M' || m[2] == 'X' || m[2] == '=') pos_end += len, mlen += len;
 					else if (m[2] == 'I') n_gap += len;
 					else if (m[2] == 'D') n_gap += len, pos_end += len;
 				}
@@ -2494,6 +2494,10 @@ function paf_junceval(args)
 		} else { // SAM
 			ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5];
 			var flag = parseInt(t[1]);
+			if (flag & 1) {
+				if (flag & 0x40) qname += '/1';
+				else if (flag & 0x80) qname += '/2';
+			}
 			if (flag&0x100) continue; // secondary
 		}
@@ -3240,6 +3244,7 @@ function paf_sveval(args)
 			if (bed != null && bed[t[0]] == null) continue;
 			if (t[4] == '<INV>' || t[4] == '<INVDUP>') continue; // no inversion
 			if (/[\[\]]/.test(t[4])) continue; // no break points
+			if (t[6] != "." && t[6] != "PASS") continue;
 			var st = parseInt(t[1]) - 1, en = st + t[3].length;
 			// parse svlen
 			var b = _paf_get_alen(t), svlen = b[0];

data/ext/minimap2/mmpriv.h CHANGED Viewed

@@ -24,6 +24,9 @@
 #define MM_SEED_SEG_SHIFT  48
 #define MM_SEED_SEG_MASK   (0xffULL<<(MM_SEED_SEG_SHIFT))
+#define MM_JUNC_ANNO 0x1
+#define MM_JUNC_MISC 0x2
 #ifndef kroundup32
 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
 #endif
@@ -33,6 +36,7 @@
 #define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
 #define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
+#define REALLOC(type, ptr, cnt) ((type*)realloc((ptr), (cnt) * sizeof(type)))
 #ifdef __cplusplus
 extern "C" {
@@ -52,6 +56,12 @@ typedef struct {
 	mm128_t *a;
 } mm_seg_t;
+typedef struct { // record type returned (as a const pointer) by mm_idx_jump_get()
+	int32_t off, off2, cnt; // NOTE(review): meanings not visible here; presumably two reference offsets and a count, confirm in the implementation
+	int16_t strand;
+	uint16_t flag; // presumably a bitwise OR of MM_JUNC_ANNO / MM_JUNC_MISC; TODO confirm
+} mm_idx_jjump1_t;
 double cputime(void);
 double realtime(void);
 long peakrss(void);
@@ -69,17 +79,23 @@ double mm_event_identity(const mm_reg1_t *r);
 int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
 void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag);
 void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len);
+void mm_write_paf4(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len, int n_seg, int seg_idx);
 void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
 void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag);
 void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
+void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
+// indexing related in index.c
 void mm_idxopt_init(mm_idxopt_t *opt);
 const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
 int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
 int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
-mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
 mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
+int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
+int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc);
+const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n);
+// chaining in lchain.c
 mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
 					  int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
 mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
@@ -96,8 +112,12 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int
 int mm_filter_strand_retained(int n_regs, mm_reg1_t *r);
 void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
 void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac);
-void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
+void mm_set_mapq2(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr, int is_splice);
 void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b);
+void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand);
+mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
+void mm_enlarge_cigar(mm_reg1_t *r, uint32_t n_cigar);
 void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
@@ -105,6 +125,8 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
 void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
 void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
+void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand); // NOTE(review): an identical prototype is already declared earlier in this header; redundant but harmless
 FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
 mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
 int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);