bio-mummer 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/delta2gvcf.rb +181 -0
- data/lib/bio-mummer/mummer.rb +18 -14
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37345c73287e9b80c15dc7426ee1045caad772f7
|
4
|
+
data.tar.gz: 8b57b1755f5da0ba52af5ccbb92c3d55f06964e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c86e8bb5d957da2331d817101577aa9990e4685a6445d5b194747c069fc8d5708b93465e20ff95b4e1a37de9ab02402552c1d77a7ba1acd98ddbeb61bb98d574
|
7
|
+
data.tar.gz: ffb5c859d7add8bb10c9d0a8dc5dc98a51dcb9a06370633d7fe6d69a3bd8d295836bafe4e32796ba6b62a855cde9a224731aa1c1cf45e077e8cc21eb4c31fb84
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/delta2gvcf.rb
ADDED
@@ -0,0 +1,181 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
require 'optparse'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'pathname'
|
7
|
+
require 'tmpdir'
|
8
|
+
require 'bio-mummer'
|
9
|
+
require 'bio'
|
10
|
+
|
11
|
+
class OptParser
|
12
|
+
def self.parse(args)
|
13
|
+
options = OpenStruct.new
|
14
|
+
|
15
|
+
opt_parser = OptionParser.new do |opts|
|
16
|
+
opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
|
17
|
+
opts.separator ""
|
18
|
+
opts.separator "Specific options:"
|
19
|
+
|
20
|
+
opts.on("-d", "--delta FILENAME", "Delta file generated by nucmer") do |fn|
|
21
|
+
pn = Pathname.new(fn)
|
22
|
+
if pn.size?
|
23
|
+
options.delta = pn
|
24
|
+
match = pn.each_line.first.match(/(?<ref>\/.*) (?<qry>\/.*)/)
|
25
|
+
if match
|
26
|
+
ref = Pathname.new(match[:ref])
|
27
|
+
if ref.size?
|
28
|
+
options.ref = Hash[Bio::FlatFile.open(ref).map{|e| [e.entry_id, e.naseq]}]
|
29
|
+
end
|
30
|
+
|
31
|
+
qry = Pathname.new(match[:qry])
|
32
|
+
if qry.size?
|
33
|
+
options.qry = Hash[Bio::FlatFile.open(qry).map{|e| [e.entry_id, e.naseq]}]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
else
|
37
|
+
$stderr.puts "Error: cannot read file #{fn}"
|
38
|
+
exit(1)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
opt_parser.parse!(args)
|
43
|
+
return options
|
44
|
+
end
|
45
|
+
end
|
46
|
+
options = OptParser.parse(ARGV)
|
47
|
+
|
48
|
+
# Open the Delta File
|
49
|
+
d = BioMummer::DeltaFile.new(StringIO.new(`delta-filter -r -q #{options.delta}`))
|
50
|
+
|
51
|
+
puts '##fileformat=VCFv4.1
|
52
|
+
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
|
53
|
+
##FILTER=<ID=LowQual,Description="Low quality">
|
54
|
+
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
|
55
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
|
56
|
+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
|
57
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
58
|
+
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
|
59
|
+
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">'
|
60
|
+
|
61
|
+
options.ref.each do |name, seq|
|
62
|
+
puts "##contig=<ID=#{name},length=#{seq.length}>"
|
63
|
+
end
|
64
|
+
puts "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDUMMYSAMPLENAME"
|
65
|
+
|
66
|
+
d.alignments
|
67
|
+
.group_by{|a| a.refname}.sort_by{|refname, alignments| refname}
|
68
|
+
.take(2)
|
69
|
+
.each do |refname, alignments|
|
70
|
+
cursor = 1
|
71
|
+
overlap = 0
|
72
|
+
alignments.sort_by{|a| [a.refstart, a.refstop].min}.each do |a|
|
73
|
+
ref = options.ref[a.refname].subseq(a.refstart, a.refstop)
|
74
|
+
qry = a.strand ? options.qry[a.queryname].subseq(a.querystart, a.querystop) : options.qry[a.queryname].subseq(a.querystart, a.querystop).complement
|
75
|
+
if a.refstart > cursor
|
76
|
+
puts [refname, cursor, '.', options.ref[refname][cursor].upcase, "<NON_REF>", '.', '.', "END=#{a.refstart - 1}", "GT:DP:GQ:MIN_DP:PL", "0:0:0:0:0,0"].join("\t")
|
77
|
+
cursor = a.refstart
|
78
|
+
elsif a.refstart < cursor
|
79
|
+
overlap = cursor - a.refstart
|
80
|
+
end
|
81
|
+
|
82
|
+
a.distances.inject([0,0]) do |mem, distance|
|
83
|
+
if distance > 0
|
84
|
+
qry.insert(mem.first + distance - 1, ".")
|
85
|
+
[mem.first + distance, mem.last + distance]
|
86
|
+
else
|
87
|
+
ref.insert(mem.last - distance - 1, ".")
|
88
|
+
[mem.first - distance, mem.last - distance]
|
89
|
+
end
|
90
|
+
end
|
91
|
+
overlap2 = 0
|
92
|
+
|
93
|
+
ref.chars.zip(0..(ref.length-1), qry.chars).chunk do |refBase, i, qryBase|
|
94
|
+
if refBase == qryBase
|
95
|
+
"NOVAR"
|
96
|
+
elsif refBase == "."
|
97
|
+
"INS"
|
98
|
+
elsif qryBase == "."
|
99
|
+
"DEL"
|
100
|
+
else
|
101
|
+
"SNP"
|
102
|
+
end
|
103
|
+
end.drop_while do |varClass, arr|
|
104
|
+
if varClass != "INS"
|
105
|
+
overlap, overlap2 = overlap - arr.length , overlap
|
106
|
+
end
|
107
|
+
overlap >= 0
|
108
|
+
end.each do |varClass, arr|
|
109
|
+
case varClass
|
110
|
+
when "NOVAR"
|
111
|
+
if overlap2 > 0
|
112
|
+
refBase = arr[overlap2].first.upcase
|
113
|
+
else
|
114
|
+
refBase = arr.first.first.upcase
|
115
|
+
overlap2 = 0
|
116
|
+
end
|
117
|
+
puts [refname,
|
118
|
+
cursor,
|
119
|
+
".",
|
120
|
+
refBase,
|
121
|
+
"<NON_REF>",
|
122
|
+
'.',
|
123
|
+
'.',
|
124
|
+
"END=#{cursor + arr.length - 1 - overlap2};ALTSCAFF=#{a.queryname};RLEN=#{arr.length};OVERLAP2=#{overlap2}",
|
125
|
+
"GT:DP:GQ:MIN_DP:PL",
|
126
|
+
"0:200:200:200:0,800"].join("\t")
|
127
|
+
cursor += arr.length - overlap2
|
128
|
+
overlap2 = 0
|
129
|
+
when "SNP"
|
130
|
+
arr.each do |snp|
|
131
|
+
puts [refname,
|
132
|
+
cursor,
|
133
|
+
".",
|
134
|
+
snp.first.upcase,
|
135
|
+
[snp.last.upcase, "<NON_REF>"].join(","),
|
136
|
+
'.',
|
137
|
+
'.',
|
138
|
+
"END=#{cursor};ALTSCAFF=#{a.queryname}",
|
139
|
+
"GT:DP:GQ:MIN_DP:PL",
|
140
|
+
"1:200:200:200:800,0,800"].join("\t")
|
141
|
+
cursor += 1
|
142
|
+
end
|
143
|
+
# puts [refname,
|
144
|
+
# cursor,
|
145
|
+
# ".",
|
146
|
+
# arr.map{|a| a.first.upcase}.join,
|
147
|
+
# [arr.map{|a| a.last.upcase}.join, "<NON_REF>"].join(","),
|
148
|
+
# '.',
|
149
|
+
# '.',
|
150
|
+
# "END=#{cursor + arr.length - 1};ALTSCAFF=#{a.queryname}",
|
151
|
+
# "GT:DP:GQ:MIN_DP:PL",
|
152
|
+
# "1:200:200:200:800,0,800"].join("\t")
|
153
|
+
# cursor += arr.length
|
154
|
+
when "INS"
|
155
|
+
refBase = options.ref[refname][cursor-2].upcase
|
156
|
+
puts [refname,
|
157
|
+
cursor-1,
|
158
|
+
".",
|
159
|
+
refBase,
|
160
|
+
[(refBase + arr.map{|a| a.last}.join).upcase, "<NON_REF>"].join(","),
|
161
|
+
'.',
|
162
|
+
'.',
|
163
|
+
"END=#{cursor - 1};ALTSCAFF=#{a.queryname};STRAND=#{a.strand}",
|
164
|
+
"GT:DP:GQ:MIN_DP:PL",
|
165
|
+
"1:200:200:200:800,0,800"].join("\t")
|
166
|
+
when "DEL"
|
167
|
+
puts [refname,
|
168
|
+
cursor,
|
169
|
+
".",
|
170
|
+
arr.map{|a| a.first.upcase}.join,
|
171
|
+
".,<NON_REF>",
|
172
|
+
'.',
|
173
|
+
'.',
|
174
|
+
"END=#{cursor + arr.length - 1};ALTSCAFF=#{a.queryname}",
|
175
|
+
"GT:DP:GQ:MIN_DP:PL",
|
176
|
+
"1:200:200:200:800,0,800"].join("\t")
|
177
|
+
cursor += arr.length
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
data/lib/bio-mummer/mummer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'stringio'
|
1
2
|
|
2
3
|
module BioMummer
|
3
4
|
|
@@ -13,25 +14,23 @@ module BioMummer
|
|
13
14
|
@querystop = querystop
|
14
15
|
@strand = strand
|
15
16
|
@distances = distances
|
16
|
-
@deltas = []
|
17
17
|
end
|
18
18
|
|
19
19
|
def deltas
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
end
|
20
|
+
ds = []
|
21
|
+
a = @distances.each_with_object([0]) do |d, arr|
|
22
|
+
state = arr.last
|
23
|
+
if d > 0
|
24
|
+
ds += Array.new(d - 1, state)
|
25
|
+
ds.push(nil)
|
26
|
+
arr << state - 1
|
27
|
+
else
|
28
|
+
ds += Array.new(d * -1 - 1, state)
|
29
|
+
arr << state + 1
|
31
30
|
end
|
32
|
-
@deltas << a.last
|
33
31
|
end
|
34
|
-
|
32
|
+
ds << a.last
|
33
|
+
return ds
|
35
34
|
end
|
36
35
|
|
37
36
|
def ref_to_query(ref_position)
|
@@ -58,6 +57,11 @@ module BioMummer
|
|
58
57
|
@alignments = parse(io.read)
|
59
58
|
end
|
60
59
|
|
60
|
+
def self.open(filename)
|
61
|
+
io = File.open(filename)
|
62
|
+
self.new(super(io))
|
63
|
+
end
|
64
|
+
|
61
65
|
def parse(string)
|
62
66
|
string.split("\n").slice_before(/^>/).flat_map do |block|
|
63
67
|
refname, queryname = block.shift.match(/>(.*) (.*) \d+ \d+/).captures
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-mummer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- robsyme
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: shoulda
|
@@ -97,7 +97,8 @@ dependencies:
|
|
97
97
|
description: Help for working with the output of the .delta files produced by nucmer
|
98
98
|
and promer
|
99
99
|
email: rob.syme@gmail.com
|
100
|
-
executables:
|
100
|
+
executables:
|
101
|
+
- delta2gvcf.rb
|
101
102
|
extensions: []
|
102
103
|
extra_rdoc_files:
|
103
104
|
- LICENSE.txt
|
@@ -112,6 +113,7 @@ files:
|
|
112
113
|
- README.rdoc
|
113
114
|
- Rakefile
|
114
115
|
- VERSION
|
116
|
+
- bin/delta2gvcf.rb
|
115
117
|
- lib/bio-mummer.rb
|
116
118
|
- lib/bio-mummer/mummer.rb
|
117
119
|
- test/data/out.delta
|