dna_sequence_aligner 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ .DS_Store
2
+ pkg/
3
+ rdoc/
4
+ backup/
5
+ config/
6
+ data/
7
+ *.swp
8
+ *.gemspec
data/History ADDED
@@ -0,0 +1,10 @@
1
+ == 0.0.2 / 2010-02-12
2
+
3
+ * standardized output, used it to align 8 different sequences with roughly 40
4
+ total reads.
5
+
6
+ == 0.0.1 / 2010-02-11
7
+
8
+ * Beginning
9
+
10
+
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2010 Howard Hughes Medical Institute
4
+ Authored by John T. Prince
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
@@ -0,0 +1,58 @@
1
+ = DNASequenceAligner
2
+
3
+ dna_sequence_aligner assumes you have a template dna sequence. All other DNA
4
+ sequences are matched up with the template and then they are all merged into
5
+ one template-centric alignment. The output was custom designed to show
6
+ coverage at a glance in a template-centric fashion.
7
+
8
+ The software is also written so that you can annotate your template fasta file
9
+ with comments (each comment line must begin with a '#' character).
10
+
11
+ == Dependencies
12
+
13
+ *Clustalw* must be installed (*clustalw* package in ubuntu/debian) and
14
+ available on your PATH.
15
+
16
+ [Bioruby is heavily relied on, but it is explicitly stated as a gem dependency
17
+ so you shouldn't have to worry about it if installed by gem]
18
+
19
+ == Examples
20
+
21
+ The executable is the main item of interest. It takes one (or more) sequence
22
+ files. Your template should be the first fasta encountered.
23
+
24
+ dna_sequence_aligner template.fasta others.fasta > output.aligned.txt
25
+
26
+ # sequences in separate files
27
+ dna_sequence_aligner template.fasta other1.fasta other2.fasta > output.aligned.txt
28
+
29
+ # all sequences in one file (template first)
30
+ dna_sequence_aligner all_seqs.fasta > output.aligned.txt
31
+
32
+ A comment (#) aware DNA sequence translator is provided to check and see if
33
+ things are in register and so forth. It outputs the DNA sequence and protein
34
+ sequence below it.
35
+
36
+ # -s 2 is a 2 nucleotide frameshift
37
+ dna_translator.rb -s 2 dna_annotated.fasta > protein.txt
38
+
39
+ == Legend
40
+
41
+ all gaps <blank>
42
+ template gap ^
43
+ gap below template .
44
+ agreement =
45
+ all bad matches ^
46
+ non-consensus ?
47
+
48
+ == NOTE
49
+
50
+ This is very much alpha software at the moment. It was written in a time
51
+ crunch and so it is a little rough around the edges. However, key components
52
+ have specs written and appear to work properly. If I have to do more
53
+ alignments or you send me pull requests then this may get to be nicer
54
+ software.
55
+
56
+ == Copyright
57
+
58
+ See LICENSE
@@ -0,0 +1,45 @@
1
+
2
+ require 'rubygems'
3
+ require 'rake'
4
+ require 'jeweler'
5
+ require 'rake/testtask'
6
+ # require 'rcov/rcovtask'
7
+
8
# Gem name; also used to build the homepage URL and the rdoc title.
NAME = "dna_sequence_aligner"
# Directory tree that the generated rdoc is written under.
WEBSITE_BASE = "website"
WEBSITE_OUTPUT = WEBSITE_BASE + "/output"

# Gem metadata handed to Jeweler, which generates the packaging tasks.
gemspec = Gem::Specification.new do |s|
  s.name = NAME
  s.authors = ["John T. Prince"]
  s.email = "jtprince@gmail.com"
  s.homepage = "http://jtprince.github.com/" + NAME
  s.summary = "does high pairwise alignment of sequencing reads with a template"
  s.description = "does high pairwise alignment of sequencing reads with a template using bioruby and clustalw. gives template-centric output."
  s.add_dependency("bio")
  s.add_development_dependency("spec-more")
end

Jeweler::Tasks.new(gemspec)

# `rake spec` runs every spec/**/*_spec.rb with lib/ and spec/ on the load path.
Rake::TestTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.verbose = true
end

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  base_rdoc_output_dir = WEBSITE_OUTPUT + '/rdoc'
  # strip the trailing newline of the VERSION file so that it does not end up
  # inside the output directory name and the rdoc title
  version = File.read('VERSION').strip
  rdoc.rdoc_dir = base_rdoc_output_dir + "/#{version}"
  rdoc.title = NAME + ' ' + version
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

task :default => :spec

task :build => :gemspec
44
+
45
+ # credit: Rakefile modeled after Jeweler's
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.2
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
+ require 'bio/alignment/dna_sequence'
7
+
8
# Returns an Array of the entry objects that Bio::FlatFile finds in +file+.
# Every line that starts with '#' is treated as a comment and removed before
# the text is handed to the parser.
def fasta_entries(file)
  # File.read, unlike IO.read, never interprets a leading '|' in the name as a
  # command to execute
  clean_string = File.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
  entries = []
  Bio::FlatFile.auto(StringIO.new(clean_string)) do |ff|
    ff.each_entry {|entry| entries << entry }
  end
  entries
end

DNAReads = Bio::Alignment::DNASequenceReads

# start from the library defaults; the command line switches overwrite them
opt = DNAReads::ALIGN_OPTS.dup

op = OptionParser.new do |o|
  o.banner = "usage: #{File.basename(__FILE__)} <template>.fasta <read>.txt ..."
  o.separator " "
  o.on("--fidelity-length <#{opt[:fidelity_length]}>", Integer, "min length of correct reads on the ends") {|v| opt[:fidelity_length] = v }
  o.separator " "
  o.on("--type <DNA|PROTEIN>", "type of bio sequences") {|v| opt[:type] = v }
  o.on("--gapopen <#{opt[:gapopen]}>", Float, "gap opening penalty") {|v| opt[:gapopen] = v }
  o.on("--gapext <#{opt[:gapext]}>", Float, "gap extension penalty") {|v| opt[:gapext] = v }
  o.on("--dnamatrix <String>", "DNA weight matrix IUB|CLUSTALW") {|v| opt[:dnamatrix] = v }
  o.separator " "
  o.on("-n", "--no-consensus", "use original seq for fidelity (not cons.)") {|v| opt[:consensus_fidelity] = false }
  o.separator " "
  o.separator "the first sequence is assumed the template sequence"
end
# consume the switches; without this call they are never applied and would be
# mistaken for input file names below
op.parse!

if ARGV.size == 0
  puts op
  exit
end

# every remaining argument is a sequence file; the first entry found is the template
entries = ARGV.inject([]) {|ar, file| ar.push( *fasta_entries(file) ) }
bioseqs = entries.map {|entry| Bio::Sequence::NA.new(entry.seq) }
labels = entries.map {|entry| entry.definition }

pairwise = DNAReads.align_pairwise(bioseqs, opt)

(template, others) = DNAReads.merge_pairwise(pairwise)
# the first label belongs to the template; the rest line up with +others+
template_label = labels.shift

DNAReads.print_align(STDOUT, others, labels, :template => template, :template_label => template_label, :chars => 30)
60
+
61
+
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
opt = {
  :frameshift => 0
}
op = OptionParser.new do |o|
  o.banner = "usage: #{File.basename(__FILE__)} <dnaseq>.fasta"
  o.on("-s", "--shift <int>", Integer, "frameshift") {|v| opt[:frameshift] = v }
end
op.parse!

if ARGV.size == 0
  puts op
  exit
end

# number of leading nucleotides to drop before translating
frameshift = opt[:frameshift]

file = ARGV.shift
# lines starting with '#' are annotations, not sequence; File.read (unlike
# IO.read) never interprets a leading '|' in the name as a command to execute
string = File.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
ff = Bio::FlatFile.auto(StringIO.new(string))

seqs = []
ff.each_entry do |entry|
  # a shift beyond the end of the sequence leaves nothing to translate
  seqs << (entry.seq[frameshift..-1] || '')
end

# nucleotides printed per output line
length = 70

seqs.each do |seq|
  protseq = Bio::Sequence::NA.new(seq).translate
  start = 0
  while start < seq.length
    puts seq[start, length]
    # put each residue under the first nucleotide of its codon, blanks elsewhere
    prot_line = (start...(start + length)).map do |x|
      (x % 3 == 0) ? protseq[x / 3, 1] : " "
    end.join
    puts prot_line
    start += length
  end
  print "NUM START/STOP CODONS: "
  puts protseq.to_s.count('*')
end
58
+
59
+
@@ -0,0 +1,313 @@
1
+
2
+
3
module Bio
  module Alignment
    # Template-centric alignment of DNA reads: each read is aligned pairwise
    # against a template with clustalw (align_pairwise), the pairwise results
    # are merged into one set of equally long strings (merge_pairwise) and the
    # result is printed in fixed-width windows with a consensus line
    # (print_align).
    module DNASequenceReads

      module_function

      # option keys that are forwarded to clustalw as "-key=value"
      CLUSTALW_OPTS = %w(gapopen gapext dnamatrix type)

      # Default options. The first four are clustalw options (see
      # CLUSTALW_OPTS); the last two steer the trimming in align_pairwise.
      ALIGN_OPTS = {
        :type => 'DNA',
        :gapopen => 20,               # gap opening penalty
        :gapext => 20,                # gap extension penalty
        :dnamatrix => 'IUB',          # "IUB" || "CLUSTALW"
        :fidelity_length => 10,       # min length of correct reads on the ends
        :consensus_fidelity => true,  # judge fidelity on the consensus, not the read
      }

      # Returns the index at which the first run of at least +min_length+
      # consecutive good characters starts. A good character is anything
      # other than '?', '-', 'N' or 'n'.
      #
      # When the string holds no such run the return value is
      # string length + 1, i.e. an index past the end, so that
      # find_good_section always ends up with a negative length.
      def find_start_good_section(iupac_concensus_string, min_length)
        run_length = 0
        position = 0
        iupac_concensus_string.each_char do |char|
          if char =~ /[^\?\-Nn]/
            run_length += 1
            # position is the last char of the run; step back to its first
            return position - (run_length - 1) if run_length >= min_length
          else
            run_length = 0
          end
          position += 1
        end
        iupac_concensus_string.length + 1
      end

      # Returns [start, length] of the section that begins with the first run
      # of +min_length+ good characters and ends with the last such run, or
      # nil when the string has no run of that length.
      def find_good_section(iupac_concensus_string, min_length)
        start = find_start_good_section(iupac_concensus_string, min_length)
        from_end = find_start_good_section(iupac_concensus_string.reverse, min_length)
        length = iupac_concensus_string.length - start - from_end
        length < 0 ? nil : [start, length]
      end

      # Turns the entries of +hash+ whose key is listed in CLUSTALW_OPTS into
      # an Array of "-key=value" strings; all other entries are ignored.
      def hash_opts_to_clustalopts(hash)
        array = []
        hash.each do |k, v|
          array << "-#{k}=#{v}" if CLUSTALW_OPTS.include?(k.to_s)
        end
        array
      end

      # Returns a copy of +string+ without its leading '-' characters.
      def lstrip_dash(string)
        string[first_non_dash_char(string)..-1]
      end

      # Returns a copy of +string+ without leading and trailing '-' characters.
      def strip_dash(string)
        lstrip_dash(lstrip_dash(string).reverse).reverse
      end

      # Returns the index of the first character that is not '-' (the string
      # length when there is none).
      def first_non_dash_char(string)
        string[/\A-*/].size
      end

      # Builds a Bio::Alignment from +bioseqs+ and returns the result of
      # do_align with +factory+.
      def clustal_align(bioseqs, factory)
        Bio::Alignment.new(bioseqs).do_align(factory)
      end

      # Returns one pairwise alignment (template versus read) per read.
      #
      # bioseqs:: the template followed by the reads; the Array is not modified
      # opt::     see ALIGN_OPTS. Keys listed in CLUSTALW_OPTS are passed to
      #           clustalw. :fidelity_length falls back to the ALIGN_OPTS value
      #           when absent. With a true :consensus_fidelity the good section
      #           is looked up in the alignment's consensus, otherwise in the
      #           aligned read itself.
      #
      # Each read is aligned once, cut down to its good section (see
      # find_good_section) and aligned again. A read without a usable good
      # section keeps its first alignment and a warning is issued.
      def align_pairwise(bioseqs, opt={})
        factory = Bio::ClustalW.new
        factory.options = hash_opts_to_clustalopts(opt)
        template, *reads = bioseqs
        min_length = opt[:fidelity_length] || ALIGN_OPTS[:fidelity_length]
        reads.map do |bseq|
          clust_al = clustal_align([template, bseq], factory)
          aligned_string = clust_al[1].to_s
          seq_to_use = opt[:consensus_fidelity] ? clust_al.consensus : aligned_string
          (st, len) = find_good_section(seq_to_use, min_length)
          # pristine read (ends removed)
          pristine = st && aligned_string[st, len].to_s.gsub('-', '')
          if pristine && !pristine.empty?
            clustal_align([template.to_s, Bio::Sequence::NA.new(pristine)], factory)
          else
            warn "a sequence does not meet min fidelity! using original alignment"
            clust_al
          end
        end
      end

      # Merges pairwise alignments into one set of equally long strings.
      # Assumes all were aligned to the same template (the first of a pair).
      #
      # aligns:: each element yields, via each, the aligned template and then
      #          the aligned read; to_s is called on both
      #
      # Returns [template_string, [read_string, ...]]. Wherever any pair has a
      # gap in its template the merged template gets a '-', and so does every
      # read whose own template has no gap at that point. Returns ["", []]
      # for an empty +aligns+.
      def merge_pairwise(aligns)
        pairs = aligns.map do |align|
          seqs = []
          align.each {|bioseq| seqs << bioseq.to_s }
          [seqs[0] || '', seqs[1] || '']
        end
        return ['', []] if pairs.empty?

        indices = (0...pairs.size).to_a
        cursors = indices.map { 0 }   # per pair: next unread column
        template = []
        others = indices.map { [] }

        # one-character lookup that gives a String on every Ruby version
        # (String#[] with a single Integer returns a byte value on 1.8);
        # nil at or past the end of the string
        char_at = lambda do |string, index|
          char = string[index, 1]
          (char.nil? || char.empty?) ? nil : char
        end

        # The merged alignment can be longer than any single pairwise
        # alignment, so keep going until every template is used up.
        while indices.any? {|m| cursors[m] < pairs[m][0].size }
          (t_dsh, t_no_dsh) = indices.partition do |m|
            char_at.call(pairs[m][0], cursors[m]) == '-'
          end

          if t_dsh.empty?
            # no template has a gap here: consume one column of every pair
            template << (char_at.call(pairs[0][0], cursors[0]) || '-')
            indices.each do |m|
              others[m] << (char_at.call(pairs[m][1], cursors[m]) || '-')
              cursors[m] += 1
            end
          else
            # some template has a gap: only those pairs move on, all other
            # reads are padded with a gap
            template << '-'
            t_no_dsh.each {|m| others[m] << '-' }
            t_dsh.each do |m|
              others[m] << (char_at.call(pairs[m][1], cursors[m]) || '-')
              cursors[m] += 1
            end
          end
        end
        [template.join, others.map {|chars| chars.join }]
      end

      # Joins an Array of character codes into a String; nil becomes '-'.
      def cs_to_s(ar)
        ar.map {|v| v.nil? ? '-' : v.chr }.join
      end

      # Classifies every column of +strings+ (the first is the template) and
      # returns [consensus_string, stats].
      #
      # consensus_string has one symbol per template column; stats counts the
      # columns per category, indexed in this order:
      #
      #   0  all gaps              <blank>
      #   1  template gap          ^
      #   2  agreement             =
      #   3  gap below template    .
      #   4  all bad matches       ^
      #   5  non-consensus         ?
      #
      # A string that is shorter than the template counts as a gap in the
      # columns it does not reach. Returns ["", stats of zeros] for an empty
      # +strings+.
      def consensus_string_and_stats(strings)
        symbols = [' '] + %w(^ = . ^ ?)
        all_gaps = 0
        template_gap = 1
        agreement = 2
        gap_below_template = 3
        all_bad_matches = 4
        non_consensus = 5

        stats = Array.new(symbols.size, 0)
        return ['', stats] if strings.empty?

        template_chars, *read_chars = strings.map {|v| v.split("") }
        consensus_string = template_chars.zip(*read_chars).map do |column|
          # zip pads short strings with nil
          first, *chrs = column.map {|v| v.nil? ? '-' : v }
          category =
            if first == '-'
              chrs.all? {|v| v == '-' } ? all_gaps : template_gap
            elsif chrs.all? {|v| v == '-' }
              gap_below_template
            elsif chrs.all? {|v| (v == '-') || (v == first) }
              agreement
            elsif chrs.none? {|v| v == first }
              all_bad_matches
            else
              non_consensus
            end
          stats[category] += 1
          symbols[category]
        end.join
        [consensus_string, stats]
      end

      # Returns +string+ right-justified to +n+ characters and cut off after
      # +n+ characters when it is longer.
      def exactly_chars(string, n)
        ("%#{n}s" % string).slice(0, n)
      end

      # Prints the sequences to +io+ in windows, one labelled line per
      # sequence followed by a "<CONSENSUS>" line. Legend of that line:
      #
      #   all gaps              <blank>
      #   template gap          ^
      #   gap below template    .
      #   agreement             =
      #   all bad matches       ^
      #   non-consensus         ?
      #
      # sequences:: Array of equally long strings (not modified)
      # labels::    one label per sequence (not modified)
      # opts::      :template       => template_sequence, shown as first line
      #             :template_label => label of the template
      #             :cutoff         => window width (default 70)
      #             :start          => first column to print (default 0)
      #             :chars          => width of the label column (default 20)
      #
      # A sequence whose fragment holds nothing but '-' is left out of that
      # window; a window in which that is true for every sequence is skipped.
      # Printing stops at the first window that starts beyond the end of one
      # of the sequences.
      #
      # Returns the per-category column counts summed over all windows (see
      # consensus_string_and_stats). Raises ArgumentError unless :cutoff is
      # positive.
      def print_align(io, sequences, labels, opts={})
        opts = {:cutoff => 70, :start => 0, :chars => 20}.merge(opts)
        (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
        raise ArgumentError, ":cutoff must be positive" unless length > 0
        spacer = " "

        # work on copies so that the caller's arrays stay as they were
        rows = sequences.dup
        row_labels = labels.dup
        if opts[:template]
          rows.unshift(opts[:template])
          row_labels.unshift(opts[:template_label])
        end

        all_stats = Array.new(6, 0)
        return all_stats if rows.empty?

        loop do
          break if rows.any? {|string| start >= string.length }

          doubles = rows.map {|string| string[start, length] }.zip(row_labels)
          doubles = doubles.select {|frag, _| (frag.size > 0) && (frag =~ /[^-]/) }
          if doubles.empty?
            start += length
            next
          end

          max_length = doubles.map {|frag, _| frag.size }.max

          (cs, stats) = consensus_string_and_stats( doubles.map {|frag, _| frag } )
          all_stats = all_stats.zip(stats).map {|a, b| a + b }

          doubles.push( [cs, "<CONSENSUS>"] )
          lines = doubles.map {|frag, label| [exactly_chars(label, chars), spacer, frag].join }

          ## the counters at the top of the line
          # NOTE(review): finish_s only contributes to the padding width and
          # is never printed itself -- confirm whether it was meant to end
          # the counter line
          start_s = start.to_s
          finish_s = (start + max_length).to_s
          count_line_gap = max_length - (start_s.size + finish_s.size)
          count_line = [start_s, spacer]
          count_line << " " * count_line_gap unless count_line_gap < 1
          io.puts([exactly_chars("", chars), spacer, count_line.join].join)

          io.puts lines.join("\n")

          io.puts " " # separator between lines
          start += length
        end
        all_stats
      end

    end
  end
end