bio-mummer 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/delta2gvcf.rb +181 -0
- data/lib/bio-mummer/mummer.rb +18 -14
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37345c73287e9b80c15dc7426ee1045caad772f7
|
4
|
+
data.tar.gz: 8b57b1755f5da0ba52af5ccbb92c3d55f06964e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c86e8bb5d957da2331d817101577aa9990e4685a6445d5b194747c069fc8d5708b93465e20ff95b4e1a37de9ab02402552c1d77a7ba1acd98ddbeb61bb98d574
|
7
|
+
data.tar.gz: ffb5c859d7add8bb10c9d0a8dc5dc98a51dcb9a06370633d7fe6d69a3bd8d295836bafe4e32796ba6b62a855cde9a224731aa1c1cf45e077e8cc21eb4c31fb84
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/delta2gvcf.rb
ADDED
@@ -0,0 +1,181 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
require 'optparse'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'pathname'
|
7
|
+
require 'tmpdir'
|
8
|
+
require 'bio-mummer'
|
9
|
+
require 'bio'
|
10
|
+
|
11
|
+
class OptParser
|
12
|
+
def self.parse(args)
|
13
|
+
options = OpenStruct.new
|
14
|
+
|
15
|
+
opt_parser = OptionParser.new do |opts|
|
16
|
+
opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
|
17
|
+
opts.separator ""
|
18
|
+
opts.separator "Specific options:"
|
19
|
+
|
20
|
+
opts.on("-d", "--delta FILENAME", "Delta file generated by nucmer") do |fn|
|
21
|
+
pn = Pathname.new(fn)
|
22
|
+
if pn.size?
|
23
|
+
options.delta = pn
|
24
|
+
match = pn.each_line.first.match(/(?<ref>\/.*) (?<qry>\/.*)/)
|
25
|
+
if match
|
26
|
+
ref = Pathname.new(match[:ref])
|
27
|
+
if ref.size?
|
28
|
+
options.ref = Hash[Bio::FlatFile.open(ref).map{|e| [e.entry_id, e.naseq]}]
|
29
|
+
end
|
30
|
+
|
31
|
+
qry = Pathname.new(match[:qry])
|
32
|
+
if qry.size?
|
33
|
+
options.qry = Hash[Bio::FlatFile.open(qry).map{|e| [e.entry_id, e.naseq]}]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
else
|
37
|
+
$stderr.puts "Error: cannot read file #{fn}"
|
38
|
+
exit(1)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
opt_parser.parse!(args)
|
43
|
+
return options
|
44
|
+
end
|
45
|
+
end
|
46
|
+
options = OptParser.parse(ARGV)
|
47
|
+
|
48
|
+
# Open the Delta File
|
49
|
+
d = BioMummer::DeltaFile.new(StringIO.new(`delta-filter -r -q #{options.delta}`))
|
50
|
+
|
51
|
+
puts '##fileformat=VCFv4.1
|
52
|
+
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
|
53
|
+
##FILTER=<ID=LowQual,Description="Low quality">
|
54
|
+
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
|
55
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
|
56
|
+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
|
57
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
58
|
+
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
|
59
|
+
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">'
|
60
|
+
|
61
|
+
options.ref.each do |name, seq|
|
62
|
+
puts "##contig=<ID=#{name},length=#{seq.length}>"
|
63
|
+
end
|
64
|
+
puts "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDUMMYSAMPLENAME"
|
65
|
+
|
66
|
+
d.alignments
|
67
|
+
.group_by{|a| a.refname}.sort_by{|refname, alignments| refname}
|
68
|
+
.take(2)
|
69
|
+
.each do |refname, alignments|
|
70
|
+
cursor = 1
|
71
|
+
overlap = 0
|
72
|
+
alignments.sort_by{|a| [a.refstart, a.refstop].min}.each do |a|
|
73
|
+
ref = options.ref[a.refname].subseq(a.refstart, a.refstop)
|
74
|
+
qry = a.strand ? options.qry[a.queryname].subseq(a.querystart, a.querystop) : options.qry[a.queryname].subseq(a.querystart, a.querystop).complement
|
75
|
+
if a.refstart > cursor
|
76
|
+
puts [refname, cursor, '.', options.ref[refname][cursor].upcase, "<NON_REF>", '.', '.', "END=#{a.refstart - 1}", "GT:DP:GQ:MIN_DP:PL", "0:0:0:0:0,0"].join("\t")
|
77
|
+
cursor = a.refstart
|
78
|
+
elsif a.refstart < cursor
|
79
|
+
overlap = cursor - a.refstart
|
80
|
+
end
|
81
|
+
|
82
|
+
a.distances.inject([0,0]) do |mem, distance|
|
83
|
+
if distance > 0
|
84
|
+
qry.insert(mem.first + distance - 1, ".")
|
85
|
+
[mem.first + distance, mem.last + distance]
|
86
|
+
else
|
87
|
+
ref.insert(mem.last - distance - 1, ".")
|
88
|
+
[mem.first - distance, mem.last - distance]
|
89
|
+
end
|
90
|
+
end
|
91
|
+
overlap2 = 0
|
92
|
+
|
93
|
+
ref.chars.zip(0..(ref.length-1), qry.chars).chunk do |refBase, i, qryBase|
|
94
|
+
if refBase == qryBase
|
95
|
+
"NOVAR"
|
96
|
+
elsif refBase == "."
|
97
|
+
"INS"
|
98
|
+
elsif qryBase == "."
|
99
|
+
"DEL"
|
100
|
+
else
|
101
|
+
"SNP"
|
102
|
+
end
|
103
|
+
end.drop_while do |varClass, arr|
|
104
|
+
if varClass != "INS"
|
105
|
+
overlap, overlap2 = overlap - arr.length , overlap
|
106
|
+
end
|
107
|
+
overlap >= 0
|
108
|
+
end.each do |varClass, arr|
|
109
|
+
case varClass
|
110
|
+
when "NOVAR"
|
111
|
+
if overlap2 > 0
|
112
|
+
refBase = arr[overlap2].first.upcase
|
113
|
+
else
|
114
|
+
refBase = arr.first.first.upcase
|
115
|
+
overlap2 = 0
|
116
|
+
end
|
117
|
+
puts [refname,
|
118
|
+
cursor,
|
119
|
+
".",
|
120
|
+
refBase,
|
121
|
+
"<NON_REF>",
|
122
|
+
'.',
|
123
|
+
'.',
|
124
|
+
"END=#{cursor + arr.length - 1 - overlap2};ALTSCAFF=#{a.queryname};RLEN=#{arr.length};OVERLAP2=#{overlap2}",
|
125
|
+
"GT:DP:GQ:MIN_DP:PL",
|
126
|
+
"0:200:200:200:0,800"].join("\t")
|
127
|
+
cursor += arr.length - overlap2
|
128
|
+
overlap2 = 0
|
129
|
+
when "SNP"
|
130
|
+
arr.each do |snp|
|
131
|
+
puts [refname,
|
132
|
+
cursor,
|
133
|
+
".",
|
134
|
+
snp.first.upcase,
|
135
|
+
[snp.last.upcase, "<NON_REF>"].join(","),
|
136
|
+
'.',
|
137
|
+
'.',
|
138
|
+
"END=#{cursor};ALTSCAFF=#{a.queryname}",
|
139
|
+
"GT:DP:GQ:MIN_DP:PL",
|
140
|
+
"1:200:200:200:800,0,800"].join("\t")
|
141
|
+
cursor += 1
|
142
|
+
end
|
143
|
+
# puts [refname,
|
144
|
+
# cursor,
|
145
|
+
# ".",
|
146
|
+
# arr.map{|a| a.first.upcase}.join,
|
147
|
+
# [arr.map{|a| a.last.upcase}.join, "<NON_REF>"].join(","),
|
148
|
+
# '.',
|
149
|
+
# '.',
|
150
|
+
# "END=#{cursor + arr.length - 1};ALTSCAFF=#{a.queryname}",
|
151
|
+
# "GT:DP:GQ:MIN_DP:PL",
|
152
|
+
# "1:200:200:200:800,0,800"].join("\t")
|
153
|
+
# cursor += arr.length
|
154
|
+
when "INS"
|
155
|
+
refBase = options.ref[refname][cursor-2].upcase
|
156
|
+
puts [refname,
|
157
|
+
cursor-1,
|
158
|
+
".",
|
159
|
+
refBase,
|
160
|
+
[(refBase + arr.map{|a| a.last}.join).upcase, "<NON_REF>"].join(","),
|
161
|
+
'.',
|
162
|
+
'.',
|
163
|
+
"END=#{cursor - 1};ALTSCAFF=#{a.queryname};STRAND=#{a.strand}",
|
164
|
+
"GT:DP:GQ:MIN_DP:PL",
|
165
|
+
"1:200:200:200:800,0,800"].join("\t")
|
166
|
+
when "DEL"
|
167
|
+
puts [refname,
|
168
|
+
cursor,
|
169
|
+
".",
|
170
|
+
arr.map{|a| a.first.upcase}.join,
|
171
|
+
".,<NON_REF>",
|
172
|
+
'.',
|
173
|
+
'.',
|
174
|
+
"END=#{cursor + arr.length - 1};ALTSCAFF=#{a.queryname}",
|
175
|
+
"GT:DP:GQ:MIN_DP:PL",
|
176
|
+
"1:200:200:200:800,0,800"].join("\t")
|
177
|
+
cursor += arr.length
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
data/lib/bio-mummer/mummer.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'stringio'
|
1
2
|
|
2
3
|
module BioMummer
|
3
4
|
|
@@ -13,25 +14,23 @@ module BioMummer
|
|
13
14
|
@querystop = querystop
|
14
15
|
@strand = strand
|
15
16
|
@distances = distances
|
16
|
-
@deltas = []
|
17
17
|
end
|
18
18
|
|
19
19
|
def deltas
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
end
|
20
|
+
ds = []
|
21
|
+
a = @distances.each_with_object([0]) do |d, arr|
|
22
|
+
state = arr.last
|
23
|
+
if d > 0
|
24
|
+
ds += Array.new(d - 1, state)
|
25
|
+
ds.push(nil)
|
26
|
+
arr << state - 1
|
27
|
+
else
|
28
|
+
ds += Array.new(d * -1 - 1, state)
|
29
|
+
arr << state + 1
|
31
30
|
end
|
32
|
-
@deltas << a.last
|
33
31
|
end
|
34
|
-
|
32
|
+
ds << a.last
|
33
|
+
return ds
|
35
34
|
end
|
36
35
|
|
37
36
|
def ref_to_query(ref_position)
|
@@ -58,6 +57,11 @@ module BioMummer
|
|
58
57
|
@alignments = parse(io.read)
|
59
58
|
end
|
60
59
|
|
60
|
+
def self.open(filename)
|
61
|
+
io = File.open(filename)
|
62
|
+
self.new(super(io))
|
63
|
+
end
|
64
|
+
|
61
65
|
def parse(string)
|
62
66
|
string.split("\n").slice_before(/^>/).flat_map do |block|
|
63
67
|
refname, queryname = block.shift.match(/>(.*) (.*) \d+ \d+/).captures
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-mummer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- robsyme
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: shoulda
|
@@ -97,7 +97,8 @@ dependencies:
|
|
97
97
|
description: Help for working with the output of the .delta files produced by nucmer
|
98
98
|
and promer
|
99
99
|
email: rob.syme@gmail.com
|
100
|
-
executables: []
|
100
|
+
executables:
|
101
|
+
- delta2gvcf.rb
|
101
102
|
extensions: []
|
102
103
|
extra_rdoc_files:
|
103
104
|
- LICENSE.txt
|
@@ -112,6 +113,7 @@ files:
|
|
112
113
|
- README.rdoc
|
113
114
|
- Rakefile
|
114
115
|
- VERSION
|
116
|
+
- bin/delta2gvcf.rb
|
115
117
|
- lib/bio-mummer.rb
|
116
118
|
- lib/bio-mummer/mummer.rb
|
117
119
|
- test/data/out.delta
|