dna_sequence_aligner 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
# Prints the arguments (no trailing newline) and flushes $stdout, but only
# when the global $VERBOSE flag is truthy. Does nothing otherwise.
def printv(*args)
  return unless $VERBOSE

  print(*args)
  $stdout.flush
end
11
+
12
# Like Kernel#puts followed by a flush of $stdout, but only when the global
# $VERBOSE flag is truthy. Does nothing otherwise.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
17
+
18
+
19
# Writes the given sequences to +io+ in stacked, fixed-width chunks.
#
# For each chunk, one line per sequence is written as
# "<label> : <slice>", followed by a separator line holding a single space.
# Chunking stops as soon as any sequence has no characters left at the
# current offset (a final separator line is still written at that point).
#
# io        - object responding to #puts.
# sequences - Array of Strings to print.
# labels    - Array of label Strings, paired positionally with sequences;
#             each label must respond to #exactly_chars.
# opts      - Hash of options:
#             :cutoff - characters per chunk (default 100, must be > 0)
#             :start  - offset of the first character printed (default 0)
#             :chars  - width the labels are padded/truncated to (default 20)
#
# Raises ArgumentError if :cutoff is not positive (the offset would never
# advance). Writes nothing when sequences is empty (the loop would otherwise
# never terminate).
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)

  raise ArgumentError, ":cutoff must be positive (got #{length.inspect})" unless length > 0
  return if sequences.empty?

  loop do
    fin = false
    sequences.zip(labels) do |string, label|
      # stop at the first sequence that is exhausted at this offset
      fin = (start >= string.length)
      break if fin
      io.puts "#{label.exactly_chars(chars)} : #{string[start,length]}"
    end
    io.puts " "
    break if fin
    start += length
  end
end
35
+
36
class String

  # Compares self against +template+ position by position, over the length
  # of +template+.
  #
  # Returns a three-element Array of percentages:
  #   [ % of template positions where both strings hold the same character,
  #     % of the template's letters that are matched by an identical letter,
  #     % of self's letters (within the compared span) that are matched ]
  # A "letter" is any character matching /[A-Za-z]/.
  def percent_similar_to(template)
    span = template.size
    same_chars = 0
    same_letters = 0
    template_letters = 0
    own_letters = 0

    span.times do |idx|
      own_is_letter = (self[idx, 1] =~ /[A-Za-z]/)
      own_letters += 1 if own_is_letter
      template_letters += 1 if template[idx, 1] =~ /[A-Za-z]/

      next unless self[idx] == template[idx]
      same_chars += 1
      same_letters += 1 if own_is_letter
    end

    ratios = [
      [same_chars, span],
      [same_letters, template_letters],
      [same_letters, own_letters]
    ]
    ratios.map {|numerator, denominator| (numerator.to_f / denominator) * 100 }
  end

  # Returns self right-justified (space padded on the left) to at least +n+
  # characters and then cut to the first +n+ characters.
  def exactly_chars(n)
    padded = format("%#{n}s", self)
    padded[0, n]
  end

end
67
+
68
# Reads every entry from a sequence flat file.
#
# file - path handed to Bio::FlatFile.auto.
#
# Returns a two-element Array: [Array of Bio::Sequence::NA objects,
# Array of the entries' definition lines], in file order.
def seqs_and_defs(file)
  na_seq_objs = []
  definitions = []
  # Block form so the underlying file handle is closed when reading is done
  # instead of being left open for the rest of the run.
  Bio::FlatFile.auto(file) do |ff|
    ff.each_entry do |entry|
      definitions << entry.definition
      na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
    end
  end
  [na_seq_objs, definitions]
end
78
+
79
# Script options; :min is the percent-identity threshold a sequence must
# exceed (third value of String#percent_similar_to) to be kept.
opt = {
  :min => 80,
}

# printv/putsv are gated on the global $VERBOSE flag; on by default,
# switched off with --quiet.
$VERBOSE = true
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <many_seqs>.fasta <one_template>.fasta"
  op.separator "output: <many_seqs>__<one_template>.<Threshold>.aligned"
  op.separator " "
  op.separator "description: goes through and does pairwise matching between all sequences and template,"
  op.separator "then does a multiple alignment on all those with the template."
  op.separator " "
  op.on("-t", "--threshold-pct <#{opt[:min]}>", Float, "minimum % match of given sequence") {|v| opt[:min] = v }
  op.on("-q", "--quiet", "don't give any info while running") {|v| $VERBOSE = false }
end
opts.parse!

if ARGV.size != 2
  puts opts
  exit
end

(all_fasta_file, template) = ARGV

(file_base, template_base) = ARGV.map do |file|
  File.basename(file, ".*")
end

outfile = [[file_base, template_base].join("__"), opt[:min], 'aligned'].join('.')

(seqs, definitions) = seqs_and_defs(all_fasta_file)

# Only the first entry of the template file is used.
(template_seq, template_def) = seqs_and_defs(template).map(&:first)
abort "no sequences found in template file: #{template}" if template_seq.nil?

# Debugging aid: restrict the run to a small slice of the input.
#seqs = seqs[184,10]
#definitions = definitions[184,10]

#PSIM = %w(%chars_to_template %letters_to_template %letters_to_self)

factory = Bio::ClustalW.new

# [definition, sequence] pairs whose similarity exceeded the threshold
pass_threshold = []

printv "performing pairwise alignments [on #{seqs.size} seqs]: "
seqs.zip(definitions) do |seq, df|
  # skip sequences containing nothing but N (this also skips empty ones)
  if seq.to_s !~ /[^N]/i
    printv '- '
    next
  end
  align = Bio::Alignment.new([template_seq, seq])
  result = align.do_align(factory)
  # The block parameter is deliberately not called `seq`, so the outer `seq`
  # pushed onto pass_threshold below is never shadowed or overwritten.
  (template_s, seq_s) = result.map do |aligned|
    aligned.to_s
  end
  psimilar = seq_s.percent_similar_to(template_s)
  printv( ("%.0f" % psimilar.last) + ' ')
  pass_threshold << [df, seq] if psimilar.last > opt[:min]
end
putsv "Done!"

abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"

# The template takes part in the multiple alignment as the last entry.
pass_threshold << [template_def, template_seq]

multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
m_result = multi_align.do_align(factory).strip

labels = pass_threshold.map {|pair| pair.first }
aligned_seqs = m_result.map {|seq| seq.to_s }

File.open(outfile, 'w') do |out|
  print_align(out, aligned_seqs, labels)
end
160
+
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
# Prints the arguments (no trailing newline) and flushes $stdout, but only
# when the global $VERBOSE flag is truthy. Does nothing otherwise.
def printv(*args)
  return unless $VERBOSE

  print(*args)
  $stdout.flush
end
11
+
12
# Like Kernel#puts followed by a flush of $stdout, but only when the global
# $VERBOSE flag is truthy. Does nothing otherwise.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
17
+
18
+
19
# Writes the given sequences to +io+ in stacked, fixed-width chunks.
#
# For each chunk, one line per sequence is written as
# "<label> : <slice>", followed by a separator line holding a single space.
# Chunking stops as soon as any sequence has no characters left at the
# current offset (a final separator line is still written at that point).
#
# io        - object responding to #puts.
# sequences - Array of Strings to print.
# labels    - Array of label Strings, paired positionally with sequences;
#             each label must respond to #exactly_chars.
# opts      - Hash of options:
#             :cutoff - characters per chunk (default 100, must be > 0)
#             :start  - offset of the first character printed (default 0)
#             :chars  - width the labels are padded/truncated to (default 20)
#
# Raises ArgumentError if :cutoff is not positive (the offset would never
# advance). Writes nothing when sequences is empty (the loop would otherwise
# never terminate).
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)

  raise ArgumentError, ":cutoff must be positive (got #{length.inspect})" unless length > 0
  return if sequences.empty?

  loop do
    fin = false
    sequences.zip(labels) do |string, label|
      # stop at the first sequence that is exhausted at this offset
      fin = (start >= string.length)
      break if fin
      io.puts "#{label.exactly_chars(chars)} : #{string[start,length]}"
    end
    io.puts " "
    break if fin
    start += length
  end
end
35
+
36
class String

  # Compares self against +template+ position by position, over the length
  # of +template+.
  #
  # Returns a three-element Array of percentages:
  #   [ % of template positions where both strings hold the same character,
  #     % of the template's letters that are matched by an identical letter,
  #     % of self's letters (within the compared span) that are matched ]
  # A "letter" is any character matching /[A-Za-z]/.
  def percent_similar_to(template)
    span = template.size
    same_chars = 0
    same_letters = 0
    template_letters = 0
    own_letters = 0

    span.times do |idx|
      own_is_letter = (self[idx, 1] =~ /[A-Za-z]/)
      own_letters += 1 if own_is_letter
      template_letters += 1 if template[idx, 1] =~ /[A-Za-z]/

      next unless self[idx] == template[idx]
      same_chars += 1
      same_letters += 1 if own_is_letter
    end

    ratios = [
      [same_chars, span],
      [same_letters, template_letters],
      [same_letters, own_letters]
    ]
    ratios.map {|numerator, denominator| (numerator.to_f / denominator) * 100 }
  end

  # Returns self right-justified (space padded on the left) to at least +n+
  # characters and then cut to the first +n+ characters.
  def exactly_chars(n)
    padded = format("%#{n}s", self)
    padded[0, n]
  end

end
67
+
68
# Reads every entry from a sequence flat file.
#
# file - path handed to Bio::FlatFile.auto.
#
# Returns a two-element Array: [Array of Bio::Sequence::NA objects,
# Array of the entries' definition lines], in file order.
def seqs_and_defs(file)
  na_seq_objs = []
  definitions = []
  # Block form so the underlying file handle is closed when reading is done
  # instead of being left open for the rest of the run.
  Bio::FlatFile.auto(file) do |ff|
    ff.each_entry do |entry|
      definitions << entry.definition
      na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
    end
  end
  [na_seq_objs, definitions]
end
78
+
79
outfile = "aligned.txt"

# Percent-identity threshold a sequence must exceed to be kept.
# NOTE(review): the original script read opt[:min] without ever defining
# opt; 80 mirrors the default of the sibling alignment script -- confirm.
opt = {
  :min => 80,
}

# printv/putsv are gated on the global $VERBOSE flag.
$VERBOSE = true
opts = OptionParser.new do |op|
  # The script has always required exactly two arguments; the banner now
  # says so.
  op.banner = "usage: #{File.basename(__FILE__)} template.fasta <many_seqs>.fasta"
  op.separator "output: aligned.txt"
  # NOTE(review): the ANNOTATED-template stripping described on the next line
  # is not implemented anywhere in this script.
  op.separator "if template.ANNOTATED.fasta, then strips leading '#' lines and writes template.fasta"
  op.separator " "
end
opts.parse!

if ARGV.size != 2
  puts opts
  exit
end

template = ARGV.shift
# NOTE(review): the original never loaded any sequences (seqs, definitions,
# template_seq and template_def were undefined, so the script raised
# NameError). The second argument is presumed to be the multi-sequence
# FASTA file, as in the sibling script -- confirm.
all_fasta_file = ARGV.shift

(seqs, definitions) = seqs_and_defs(all_fasta_file)

# Only the first entry of the template file is used.
(template_seq, template_def) = seqs_and_defs(template).map(&:first)
abort "no sequences found in template file: #{template}" if template_seq.nil?

# Debugging aid: restrict the run to a small slice of the input.
#seqs = seqs[184,10]
#definitions = definitions[184,10]

#PSIM = %w(%chars_to_template %letters_to_template %letters_to_self)

factory = Bio::ClustalW.new

# [definition, sequence] pairs whose similarity exceeded the threshold
pass_threshold = []

printv "performing pairwise alignments [on #{seqs.size} seqs]: "
seqs.zip(definitions) do |seq, df|
  # skip sequences containing nothing but N (this also skips empty ones)
  if seq.to_s !~ /[^N]/i
    printv '- '
    next
  end
  align = Bio::Alignment.new([template_seq, seq])
  result = align.do_align(factory)
  # The block parameter is deliberately not called `seq`, so the outer `seq`
  # pushed onto pass_threshold below is never shadowed or overwritten.
  (template_s, seq_s) = result.map do |aligned|
    aligned.to_s
  end
  psimilar = seq_s.percent_similar_to(template_s)
  printv( ("%.0f" % psimilar.last) + ' ')
  pass_threshold << [df, seq] if psimilar.last > opt[:min]
end
putsv "Done!"

abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"

# The template takes part in the multiple alignment as the last entry.
pass_threshold << [template_def, template_seq]

multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
m_result = multi_align.do_align(factory).strip

labels = pass_threshold.map {|pair| pair.first }
aligned_seqs = m_result.map {|seq| seq.to_s }

File.open(outfile, 'w') do |out|
  print_align(out, aligned_seqs, labels)
end
+
@@ -0,0 +1,73 @@
1
+ # http://align.genome.jp/clustalw/clustalw_help.html
2
+
3
+ >>HELP 8 << Help for command line parameters
4
+
5
+ DATA (sequences)
6
+
7
+ -INFILE=file.ext :input sequences.
8
+
9
+
10
+
11
+ VERBS (do things)
12
+
13
+ -OPTIONS :list the command line parameters
14
+ -HELP or -CHECK :outline the command line params.
15
+ -ALIGN :do full multiple alignment
16
+ -TREE :calculate NJ tree.
17
+ -BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000).
18
+ -CONVERT :output the input sequences in a different file format.
19
+
20
+
21
+ PARAMETERS (set things)
22
+
23
+ ***General settings:****
24
+ -INTERACTIVE :read command line, then enter normal interactive menus
25
+ -QUICKTREE :use FAST algorithm for the alignment guide tree
26
+ -TYPE= :PROTEIN or DNA sequences
27
+ -NEGATIVE :protein alignment with negative values in matrix
28
+ -OUTFILE= :sequence alignment file name
29
+ -OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
30
+ -OUTORDER= :INPUT or ALIGNED
31
+ -CASE :LOWER or UPPER (for GDE output only)
32
+ -SEQNOS= :OFF or ON (for Clustal output only)
33
+ -SEQNO_RANGE=:OFF or ON (NEW: for all output formats)
34
+ -RANGE=m,n :sequence range to write starting m to m+n.
35
+
36
+ ***Fast Pairwise Alignments:***
37
+ -KTUPLE=n :word size
38
+ -TOPDIAGS=n :number of best diags.
39
+ -WINDOW=n :window around best diags.
40
+ -PAIRGAP=n :gap penalty
41
+ -SCORE :PERCENT or ABSOLUTE
42
+
43
+
44
+ ***Slow Pairwise Alignments:***
45
+ -PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
46
+ -PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
47
+ -PWGAPOPEN=f :gap opening penalty
48
+ -PWGAPEXT=f :gap extension penalty
49
+
50
+
51
+ ***Multiple Alignments:***
52
+ -NEWTREE= :file for new guide tree
53
+ -USETREE= :file for old guide tree
54
+ -MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
55
+ -DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
56
+ -GAPOPEN=f :gap opening penalty
57
+ -GAPEXT=f :gap extension penalty
58
+ -ENDGAPS :no end gap separation pen.
59
+ -GAPDIST=n :gap separation pen. range
60
+ -NOPGAP :residue-specific gaps off
61
+ -NOHGAP :hydrophilic gaps off
62
+ -HGAPRESIDUES= :list hydrophilic res.
63
+ -MAXDIV=n :% ident. for delay
64
+ -TYPE= :PROTEIN or DNA
65
+ -TRANSWEIGHT=f :transitions weighting
66
+
67
+
68
+ ***Trees:***
69
+ -OUTPUTTREE=nj OR phylip OR dist OR nexus
70
+ -SEED=n :seed number for bootstraps.
71
+ -KIMURA :use Kimura's correction.
72
+ -TOSSGAPS :ignore positions with gaps.
73
+ -BOOTLABELS=node OR branch :position of bootstrap values in tree display
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/ruby
2
+
3
# Concatenates the FASTA files named on the command line into one output
# file, dropping every line that starts with '#'.
outfile = "ANALYZE.FASTA"

if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} <file>.fasta ..."
  puts "comments (starting with '#') are ok"
  puts "outputs: #{outfile}"
  exit
end

# File.read (not IO.read) so that an argument beginning with "|" is always
# treated as a plain file name and never executed as a command.
all_text = ARGV.map do |file|
  File.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
end.join("\n")

File.open(outfile, 'w') do |out|
  out.print all_text
end
19
+
@@ -0,0 +1,67 @@
1
+
2
+ require File.dirname(__FILE__) + '/spec_helper'
3
+ require 'bio/alignment/dna_sequence'
4
+
5
# Short alias for the class under test.
DNAReads = Bio::Alignment::DNASequenceReads

describe 'aligning' do

  # Fixtures: @pa holds [template, read] pairwise alignments; @template and
  # @aligned are the merged result expected from merging those pairs.
  before do
    @string = 'AAAATTTTGGGGGCCCCCC'
    @conc = '--A?A-AT?TTGGGGGCCCAAC?C---'
    @testcase = "testcase.fasta"

    @pa = [
      ["--ABCDEFGHIJKLMNOP", "-----DEFGHIJK-MN--"],
      ["--ABCDEFGHIJKLM-NOP", "--ABCDE---IJKLMZNOP"],
      ["--ABCDEFGHIJKLMNOP", "-------------LMNOP"],
      ["--ABCDEFGHIJKLMNOP", "--ABCDEFGHIJKLMN--"],
      ["--ABCDEFGHIJKLMNOP", "--ABC------JKLM--P"],
      ["--ABC--DEFGHIJKLMNOP", "--ABCZZDEFGHIJKLMNOP"],
    ]
    @template = "--ABC--DEFGHIJKLM-NO"
    @aligned = [
      "-------DEFGHIJK-M-N-",
      "--ABC--DE---IJKLMZNO",
      "---------------LM-NO",
      "--ABC--DEFGHIJKLM-N-",
      "--ABC--------JKLM---",
      "--ABCZZDEFGHIJKLM-NO"
    ]

    @labels = %w(one two three four five six)
  end

  it 'removes bad ends' do
    offset, span = DNAReads.find_good_section(@conc, 4)
    @conc[offset, span].is "TTGGGGGCCCAAC"
  end

  it 'aligns pairwise' do
    merged_template, merged_reads = DNAReads.merge_pairwise(@pa)
    merged_template.is @template
    @aligned.enums merged_reads
  end

  it 'can create a good consensus string' do
    # template against all merged reads
    consensus, counts = DNAReads.consensus_string_and_stats([@template, *@aligned])
    consensus.is " ===^^==========^=="
    counts.enums [2, 3, 15, 0, 0, 0]

    # template against a single read
    consensus, counts = DNAReads.consensus_string_and_stats([@template, "-------DEFGHIJK-M-N-"])
    consensus.is " ... ========.= =."
    counts.enums [5, 0, 10, 5, 0, 0]
  end

  # Disabled: only prints the formatted alignment for visual inspection.
  xit 'prints useful printout' do
    buffer = StringIO.new
    DNAReads.print_align(buffer, @aligned, @labels, :template => @template, :template_label => "template", :chars => 8)
    puts " "
    puts buffer.string
    1.is 1
  end
end