dna_sequence_aligner 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+ .DS_Store
2
+ pkg/
3
+ rdoc/
4
+ backup/
5
+ config/
6
+ data/
7
+ *.swp
8
+ *.gemspec
data/History ADDED
@@ -0,0 +1,10 @@
1
+ == 0.0.2 / 2010-02-12
2
+
3
+ * standardized output, used it to align 8 different sequences with roughly 40
4
+ total reads.
5
+
6
+ == 0.0.1 / 2010-02-11
7
+
8
+ * Beginning
9
+
10
+
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2010 Howard Hughes Medical Institute
4
+ Authored by John T. Prince
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
@@ -0,0 +1,58 @@
1
+ = DNASequenceAligner
2
+
3
+ dna_sequence_aligner assumes you have a template DNA sequence. All other DNA
4
+ sequences are matched up with the template and then they are all merged into
5
+ one template-centric alignment. The output was custom designed to show
6
+ coverage at a glance in a template-centric fashion.
7
+
8
+ The software is also written so that you can annotate your template fasta file
9
+ with comments (must lead with a '#' character).
10
+
11
+ == Dependencies
12
+
13
+ *Clustalw* must be installed (*clustalw* package in ubuntu/debian) and generally
14
+ accessible.
15
+
16
+ [Bioruby is heavily relied on, but it is explicitly stated as a gem dependency
17
+ so you shouldn't have to worry about it if installed by gem]
18
+
19
+ == Examples
20
+
21
+ The executable is the main item of interest. It takes one (or more) sequence
22
+ files. Your template should be the first fasta encountered.
23
+
24
+ dna_sequence_aligner template.fasta others.fasta > output.aligned.txt
25
+
26
+ # sequences in separate files
27
+ dna_sequence_aligner template.fasta other1.fasta other2.fasta > output.aligned.txt
28
+
29
+ # all sequences in one file (template first)
30
+ dna_sequence_aligner all_seqs.fasta > output.aligned.txt
31
+
32
+ A comment (#) aware DNA sequence translator is provided to check and see if
33
+ things are in register and so forth. It outputs the DNA sequence and protein
34
+ sequence below it.
35
+
36
+ # -s 2 is a 2 nucleotide frameshift
37
+ dna_translator.rb -s 2 dna_annotated.fasta > protein.txt
38
+
39
+ == Legend
40
+
41
+ all gaps <blank>
42
+ template gap ^
43
+ gap below template .
44
+ agreement =
45
+ all bad matches ^
46
+ non-consensus ?
47
+
48
+ == NOTE
49
+
50
+ This is very much alpha software at the moment. It was written in a time
51
+ crunch and so it is a little rough around the edges. However, key components
52
+ have specs written and appear to work properly. If I have to do more
53
+ alignments or you send me pull requests then this may get to be nicer
54
+ software.
55
+
56
+ == Copyright
57
+
58
+ See LICENSE
@@ -0,0 +1,45 @@
1
+
2
+ require 'rubygems'
3
+ require 'rake'
4
+ require 'jeweler'
5
+ require 'rake/testtask'
6
+ # require 'rcov/rcovtask'
7
+
8
# Gem name; reused for the gemspec, the homepage URL and the rdoc title.
NAME = "dna_sequence_aligner"
WEBSITE_BASE = "website"
WEBSITE_OUTPUT = WEBSITE_BASE + "/output"

# Gem metadata. The spec object is passed to Jeweler::Tasks below.
gemspec = Gem::Specification.new do |s|
  s.name = NAME
  s.authors = ["John T. Prince"]
  s.email = "jtprince@gmail.com"
  s.homepage = "http://jtprince.github.com/" + NAME
  s.summary = "does high pairwise alignment of sequencing reads with a template"
  s.description = "does high pairwise alignment of sequencing reads with a template using bioruby and clustalw. gives template-centric output."
  s.add_dependency("bio")
  s.add_development_dependency("spec-more")
end

Jeweler::Tasks.new(gemspec)

# `rake spec` runs every spec/**/*_spec.rb with lib/ and spec/ on the load path.
Rake::TestTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.verbose = true
end

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  base_rdoc_output_dir = WEBSITE_OUTPUT + '/rdoc'
  # Strip surrounding whitespace: a trailing newline in VERSION would
  # otherwise end up inside the output directory name and the title.
  version = File.read('VERSION').strip
  rdoc.rdoc_dir = base_rdoc_output_dir + "/#{version}"
  rdoc.title = NAME + ' ' + version
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

task :default => :spec

task :build => :gemspec

# credit: Rakefile modeled after Jeweler's
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.2
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
+ require 'bio/alignment/dna_sequence'
7
+
8
# Reads +file+, discards every line that starts with '#', and hands the
# remaining text to Bio::FlatFile.auto. Returns an Array holding each entry
# that the flat file yields, in order.
def fasta_entries(file)
  kept_lines = IO.read(file).split("\n").reject { |line| line =~ /^\#/ }
  entries = []
  Bio::FlatFile.auto(StringIO.new(kept_lines.join("\n"))) do |flatfile|
    flatfile.each_entry { |entry| entries << entry }
  end
  entries
end
21
+
22
DNAReads = Bio::Alignment::DNASequenceReads

# Start from a private copy of the library defaults so the option handlers
# below never modify the shared ALIGN_OPTS hash.
opt = DNAReads::ALIGN_OPTS.dup

op = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} <template>.fasta <read>.txt ..."
  parser.separator " "
  parser.on("--fidelity-length <#{opt[:fidelity_length]}>", Integer, "min length of correct reads on the ends") {|v| opt[:fidelity_length] = v }
  parser.separator " "
  parser.on("--type <DNA|PROTEIN>", "type of bio sequences") {|v| opt[:type] = v }
  parser.on("--gapopen <#{opt[:gapopen]}>", Float, "gap opening penalty") {|v| opt[:gapopen] = v }
  parser.on("--gapext <#{opt[:gapext]}>", Float, "gap extension penalty") {|v| opt[:gapext] = v }
  parser.on("--dnamatrix <String>", "DNA weight matrix IUB|CLUSTALW") {|v| opt[:dnamatrix] = v }
  parser.separator " "
  parser.on("-n", "--no-consensus", "use original seq for fidelity (not cons.)") {|v| opt[:consensus_fidelity] = false }
  parser.separator " "
  parser.separator "the first sequence is assumed the template sequence"
end

# Consume the switches declared above. Without this call none of them take
# effect and they would be opened as input files further down.
op.parse!

# Whatever is left in ARGV after option parsing is the list of input files.
if ARGV.size == 0
  puts op
  exit
end

# Entries from all files, in command-line order; the first one is the template.
entries = ARGV.inject([]) {|collected, file| collected.concat(fasta_entries(file)) }
# NOTE: sequences are always wrapped as nucleic acid, whatever --type says.
bioseqs = entries.map {|entry| Bio::Sequence::NA.new(entry.seq) }
labels = entries.map {|entry| entry.definition }

pairwise = DNAReads.align_pairwise(bioseqs, opt)

(template, others) = DNAReads.merge_pairwise(pairwise)
template_label = labels.shift

DNAReads.print_align(STDOUT, others, labels, :template => template, :template_label => template_label, :chars => 30)
60
+
61
+
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
opt = {
  :frameshift => 0
}
op = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} <dnaseq>.fasta"
  parser.on("-s", "--shift <int>", Integer, "frameshift") {|v| opt[:frameshift] = v }
end
op.parse!

# Number of leading nucleotides dropped from every sequence before translating.
# (A leftover debug print of this value used to go to stdout and ended up as
# the first line of the redirected output; it has been removed.)
frameshift = opt[:frameshift]

if ARGV.size == 0
  puts op
  exit
end

# Drop '#' comment lines before handing the text to the flat-file parser.
file = ARGV.shift
string = IO.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
st = StringIO.new(string)
ff = Bio::FlatFile.auto(st)

seqs = []
ff.each_entry do |entry|
  seq = entry.seq
  seqs << seq[frameshift..-1]
end

# Nucleotides printed per output line.
length = 70

seqs.each do |seq|
  bsq = Bio::Sequence::NA.new(seq)
  protseq = bsq.translate
  start = 0
  loop do
    break if start >= seq.length
    frag = seq[start, length]
    puts frag
    # Under the first base of every codon print the translated residue;
    # the other two positions of the codon are left blank.
    prot_line = (start...(start+length)).map do |x|
      if x % 3 == 0
        prot_i = x / 3
        protseq[prot_i,1]
      else
        " "
      end
    end.join
    puts prot_line
    start += length
  end
  # Counts the '*' characters in the translation.
  print "NUM START/STOP CODONS: "
  puts protseq.to_s.split("").select {|v| v == '*' }.size
end
58
+
59
+
@@ -0,0 +1,313 @@
1
+
2
+
3
+ module Bio
4
+ module Alignment
5
+ module DNASequenceReads
6
+
7
+ module_function
8
# Option names that hash_opts_to_clustalopts turns into "-name=value" flags;
# any other key in an options hash is skipped by that method.
CLUSTALW_OPTS = ['gapopen', 'gapext', 'dnamatrix', 'type']
9
+
10
# Scans the string from the left for the first run of +min_length+
# consecutive "good" characters (anything except '?', '-', 'N' or 'n') and
# returns the index at which that run starts.
#
# When no such run exists the return value is NOT a usable index: it is
# string length - (length of the trailing good run) + 1.
# find_good_section relies on that arithmetic to detect the failure.
def find_start_good_section(iupac_concensus_string, min_length)
  run_length = 0
  iupac_concensus_string.each_char.with_index do |char, index|
    if char =~ /[^\?\-Nn]/
      run_length += 1
      # index is the last char of the run, so step back to its first char
      return index - (run_length - 1) if run_length >= min_length
    else
      run_length = 0
    end
  end
  iupac_concensus_string.length - (run_length - 1)
end

# Returns [start, length] of the region that begins with the first run of
# +min_length+ good characters and ends with the last such run, or nil when
# the string holds no run that long.
def find_good_section(iupac_concensus_string, min_length)
  start = find_start_good_section(iupac_concensus_string, min_length)
  from_end = find_start_good_section(iupac_concensus_string.reverse, min_length)
  length = iupac_concensus_string.length - start - from_end
  # A real section contains at least one complete run, so it can never be
  # shorter than min_length. The bare "length < 0" check missed strings made
  # only of good characters but shorter than min_length ("ACGT" with min 5
  # used to yield [1, 2]).
  return nil if length < 0 || length < min_length
  [start, length]
end
39
+
40
# Builds the list of "-key=value" strings for every entry of +hash+ whose key
# (compared as a string) is listed in CLUSTALW_OPTS. Other entries are ignored.
def hash_opts_to_clustalopts(hash)
  recognized = hash.select { |key, _value| CLUSTALW_OPTS.include?(key.to_s) }
  recognized.map { |key, value| "-#{key}=#{value}" }
end
49
+
50
# Returns a copy of +string+ without its leading run of '-' characters.
def lstrip_dash(string)
  string.sub(/\A-+/, '')
end
54
+
55
# Returns a copy of +string+ with the '-' runs at both ends removed;
# dashes in the interior are kept.
def strip_dash(string)
  string.sub(/\A-+/, '').sub(/-+\z/, '')
end
59
+
60
# Returns the character index of the first character that is not '-'.
# For a string consisting only of dashes (or an empty string) this is the
# string's length.
def first_non_dash_char(string)
  string.each_char.take_while { |char| char == '-' }.size
end
70
+
71
# Wraps +bioseqs+ in a Bio::Alignment and runs do_align on it with the given
# +factory+; returns whatever do_align returns.
def clustal_align(bioseqs, factory)
  Bio::Alignment.new(bioseqs).do_align(factory)
end
75
+
76
# Default option set. The first four keys are the ones listed in
# CLUSTALW_OPTS; the last two are read by align_pairwise itself.
ALIGN_OPTS = {
  :type               => 'DNA',
  :gapopen            => 20,
  :gapext             => 20,
  :dnamatrix          => 'IUB',  # "IUB" || "CLUSTALW"
  :fidelity_length    => 10,     # min_length handed to find_good_section
  :consensus_fidelity => true,   # true: judge fidelity on the consensus, else on the aligned read
}
84
+
85
# Aligns every read against the template and returns one pairwise alignment
# per read.
#
# bioseqs:: sequences; the first is the template, the rest are the reads.
#           The array itself is not modified.
# opt::     options hash. Keys listed in CLUSTALW_OPTS are forwarded to the
#           ClustalW factory. :fidelity_length (falls back to the value in
#           ALIGN_OPTS when absent) and :consensus_fidelity are used here.
#
# Each read is first aligned as-is. find_good_section then locates the
# high-fidelity region, either on the alignment's consensus
# (:consensus_fidelity truthy) or on the aligned read itself. When a region is
# found, the read is cut down to it, its gaps are removed and it is re-aligned
# against the template. Otherwise a warning is emitted and the first alignment
# is kept.
def align_pairwise(bioseqs, opt={})
  factory = Bio::ClustalW.new
  factory.options = hash_opts_to_clustalopts(opt)
  # With the default opt={} this used to be nil and blew up inside
  # find_start_good_section on the Integer-vs-nil comparison.
  fidelity_length = opt[:fidelity_length] || ALIGN_OPTS[:fidelity_length]
  # Destructure instead of shift so the caller's array is left intact.
  template, *reads = bioseqs
  reads.map do |bseq|
    clust_al = clustal_align([template, bseq], factory)
    cl_cons = clust_al.consensus
    aligned_string = clust_al[1].to_s
    seq_to_use = opt[:consensus_fidelity] ? cl_cons : aligned_string
    (st, len) = find_good_section(seq_to_use, fidelity_length)
    if st
      pristine = aligned_string[st, len].gsub('-','') # pristine read (ends removed)
      clustal_align([template.to_s, Bio::Sequence::NA.new(pristine)], factory)
    else
      warn "a sequence does not meeting min fidelity! using original alignment"
      clust_al
    end
  end
end
115
+
116
# Merges pairwise alignments that share one template into a single
# template-centric alignment.
#
# aligns:: enumerable of alignments; each must respond to +each+ and yield the
#          gapped template first and the gapped read second (both are
#          converted with to_s).
#
# Returns [template_string, [read_string, ...]]. All returned strings have the
# same length. Wherever any pairwise alignment has a gap in its template, a
# gap column is opened in the merged template and in every read whose own
# template has no gap at that point. Returns ['', []] for an empty input.
#
# Two defects of the earlier version are addressed here:
# * gaps were detected with `str[i] == 45`, which is only true on Ruby 1.8
#   where String#[] returns a byte; the comparison is now done on one-char
#   strings and works on 1.8 and later;
# * the column loop was bounded by the longest single alignment, which cut
#   off the tail whenever different alignments had gaps at different
#   positions. It now runs until every alignment is consumed.
def merge_pairwise(aligns)
  pairs = aligns.map do |align|
    seqs = []
    align.each { |bioseq| seqs << bioseq.to_s }
    seqs
  end
  return ['', []] if pairs.empty?

  indices = (0...pairs.size).to_a
  template = []
  others = indices.map { [] }
  cursors = indices.map { 0 } # next unread column of each pairwise alignment

  # One-character string at +index+, or nil once past the end of +string+.
  char_at = lambda do |string, index|
    char = string[index, 1]
    (char.nil? || char.empty?) ? nil : char
  end

  while indices.any? { |m| cursors[m] < pairs[m][0].size }
    gapped, ungapped = indices.partition do |m|
      char_at.call(pairs[m][0], cursors[m]) == '-'
    end

    if gapped.empty?
      # no template has a gap here: a regular column, everybody advances
      template << char_at.call(pairs[0][0], cursors[0])
      indices.each do |m|
        others[m] << char_at.call(pairs[m][1], cursors[m])
        cursors[m] += 1
      end
    else
      # some template has a gap: open a gap column in the merged template
      template << '-'
      # reads whose template has no gap here get padded and do not advance
      ungapped.each { |m| others[m] << '-' }
      gapped.each do |m|
        others[m] << char_at.call(pairs[m][1], cursors[m])
        cursors[m] += 1
      end
    end
  end

  [cs_to_s(template), others.map { |chars| cs_to_s(chars) }]
end

# Joins an array of column values into a string. nil becomes '-', an Integer
# is treated as a character code, anything else contributes the first
# character of its string form.
def cs_to_s(ar)
  ar.map do |v|
    if v.nil?
      '-'
    elsif v.is_a?(Integer)
      v.chr
    else
      v.to_s[0, 1]
    end
  end.join
end
168
+
169
+ # adjust all pairwise alignments to fit each other
170
+
171
+ #consensus_template = []
172
+ #max_length = pairs_of_strings.map {|pair| pair.first.size }.max
173
+ #(0...max_length).each do |n|
174
+ # pairs_of_strings.map {|pair| pair.map {|st| st[n] } }
175
+ #end
176
+
177
# Classifies every column of a set of equally aligned strings, treating the
# first string as the template.
#
# Returns [consensus_string, stats]. consensus_string has one symbol per
# column of the template; stats is a 6-element Array of column counts
# indexed by category:
#
#   index  category            symbol
#   0      all gaps            ' '
#   1      template gap        '^'
#   2      agreement           '='
#   3      gap below template  '.'
#   4      all bad matches     '^'   (same symbol as template gap)
#   5      non-consensus       '?'
#
# Categories are tested in the order: all gaps, template gap, gap below
# template, agreement, all bad matches, non-consensus. A missing character
# in a shorter non-template string counts as a mismatch. An empty +strings+
# yields ['', [0, 0, 0, 0, 0, 0]] (it used to raise on nil).
def consensus_string_and_stats(strings)
  # category tables are column-independent, so build them once
  symbols = [' '] + %w(^ = . ^ ?)
  all_gaps           = 0
  template_gap       = 1
  agreement          = 2
  gap_below_template = 3
  all_bad_matches    = 4
  non_consensus      = 5

  stats = Array.new(6, 0)
  return ['', stats] if strings.empty?

  template_chars, *other_rows = strings.map { |v| v.split("") }
  consensus_string = template_chars.zip(*other_rows).map do |first, *chrs|
    category =
      if [first, *chrs].all? { |v| v.nil? or (v == '-') }
        all_gaps
      elsif first == '-'
        template_gap
      elsif chrs.all? { |v| v == '-' }
        gap_below_template
      elsif chrs.all? { |v| (v == '-') or (v == first) }
        agreement
      elsif chrs.all? { |v| (v == '-') or (v != first) }
        all_bad_matches
      else
        non_consensus
      end
    stats[category] += 1
    symbols[category]
  end.join
  [consensus_string, stats]
end
210
+
211
+
212
# Returns +string+ (stringified) as exactly +n+ characters: shorter values are
# padded with spaces on the left, longer ones are cut off after the first n.
def exactly_chars(string, n)
  string.to_s.rjust(n)[0, n]
end
216
+
217
+
218
# Writes the aligned sequences to +io+ in fixed-width windows, each followed
# by a consensus row built by consensus_string_and_stats.
#
# Consensus legend:
#   all gaps             <blank>
#   template gap         ^
#   gap below template   .
#   agreement            =
#   all bad matches      ^
#   non-consensus        ?
#
# io::        object responding to +puts+
# sequences:: aligned strings, one per read
# labels::    one label per entry of +sequences+
# opts::      :cutoff  window width in characters (default 70)
#             :start   offset of the first window (default 0)
#             :chars   width of the label column (default 20)
#             :template / :template_label  when :template is given it is
#             printed as the first row of every window
#
# Output stops at the first window whose start offset is at or beyond the end
# of any row. Inside a window, rows that are empty or consist only of '-' are
# left out; a window with no remaining row is skipped entirely. Neither
# +sequences+ nor +labels+ is modified. Returns nil.
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 70, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
  spacer = " "

  # Build fresh arrays rather than unshifting onto the caller's.
  rows = sequences
  row_labels = labels
  if opts[:template]
    rows = [opts[:template]] + sequences
    row_labels = [opts[:template_label]] + labels
  end
  return nil if rows.empty?

  loop do
    break if rows.any? { |row| start >= row.length }

    fragments = rows.map { |row| row[start, length] }
    doubles = fragments.zip(row_labels).select do |frag, _|
      (frag.size > 0) && (frag =~ /[^-]/)
    end

    # Nothing but gaps in this window: there is no row to print and no
    # width to derive the counter line from (this used to raise).
    if doubles.empty?
      start += length
      next
    end

    max_length = doubles.map { |frag, _| frag.size }.max

    (cs, _stats) = consensus_string_and_stats(doubles.map { |frag, _| frag })
    doubles.push([cs, "<CONSENSUS>"])

    lines = doubles.map do |frag, label|
      [exactly_chars(label, chars), spacer, frag].join
    end

    ## the counter at the top of the window
    # NOTE(review): finish_s only sizes the padding and is never printed;
    # the output format is kept as-is, confirm whether that is intended.
    start_s = start.to_s
    finish_s = (start + max_length).to_s
    count_line_gap = max_length - (start_s.size + finish_s.size)
    count_line = [start_s, spacer]
    count_line << " " * count_line_gap unless count_line_gap < 1
    io.puts [exactly_chars("", chars), spacer, count_line.join].join

    io.puts lines.join("\n")

    io.puts " " # separator between windows
    start += length
  end
  nil
end
281
+
282
+ # # accepts :template => template_sequence
283
+ #def print_align(io, sequences, labels, opts={})
284
+ #opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
285
+ #(start, length, chars) = opts.values_at(:start, :cutoff, :chars)
286
+ #spacer = " "
287
+
288
+ #loop do
289
+ #fin = false
290
+
291
+ ### the counters at the top of the line
292
+ #start_s = start.to_s
293
+ #finish_s = (start + length).to_s
294
+ #count_line_gap = length - (start_s.size + finish_s.size)
295
+
296
+ #count_line = [start_s, " " * count_line_gap, finish_s].join
297
+ #io.puts [exactly_chars("", chars), spacer, count_line].join
298
+
299
+ #sequences.zip(labels) do |string, label|
300
+ #fin = (start >= string.length )
301
+ #break if fin
302
+ #io.puts "#{exactly_chars(label, chars)}#{spacer}#{string[start,length]}"
303
+ #end
304
+ #io.puts " "
305
+ #break if fin
306
+ #start += length
307
+ #end
308
+ #end
309
+
310
+
311
+ end
312
+ end
313
+ end