dna_sequence_aligner 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/History +10 -0
- data/LICENSE +22 -0
- data/README.rdoc +58 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/bin/dna_sequence_aligner +61 -0
- data/bin/dna_translator.rb +59 -0
- data/lib/bio/alignment/dna_sequence.rb +313 -0
- data/older/align_all.rb +160 -0
- data/older/align_to_template.rb +143 -0
- data/reference/clustalw_opts.txt +73 -0
- data/script/fasta_compile_annotated.rb +19 -0
- data/spec/align_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/testfiles/HA-mKSR1-KSRF1.txt +19 -0
- data/spec/testfiles/HA-mKSR1-KSRF2.txt +20 -0
- data/spec/testfiles/HA-mKSR1-KSRF3.txt +20 -0
- data/spec/testfiles/HA-mKSR1-KSRF4.txt +20 -0
- data/spec/testfiles/HA-mKSR1-KSRF5.txt +20 -0
- data/spec/testfiles/HA-mKSR1-OXL33.txt +20 -0
- data/spec/testfiles/KSR1_mouse_NM_013571_in_HA_pREX.ANNOTATED.fasta +77 -0
- data/spec/testfiles/testcase.fasta +55 -0
- metadata +99 -0
data/older/align_all.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
# Verbose-only +print+: writes +args+ to standard output without a trailing
# newline and flushes immediately so progress markers show up as they happen.
# Does nothing (and returns nil) when $VERBOSE is false or nil.
def printv(*args)
  return unless $VERBOSE

  print(*args)
  $stdout.flush
end
|
11
|
+
|
12
|
+
# Verbose-only +puts+: writes +args+ (newline terminated) to standard output
# and flushes immediately. Does nothing (and returns nil) when $VERBOSE is
# false or nil.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
|
17
|
+
|
18
|
+
|
19
|
+
# Writes aligned sequences to +io+ in fixed-width blocks, one labelled row per
# sequence, with a " " line after every block.
#
# io        - object responding to +puts+ (File, StringIO, ...)
# sequences - Array of Strings (the aligned sequences)
# labels    - Array of Strings, parallel to +sequences+; each label is
#             formatted with String#exactly_chars
# opts      - :cutoff => residues per block (default 100)
#             :start  => offset of the first residue printed (default 0)
#             :chars  => width of the label column (default 20)
#
# Raises ArgumentError when :cutoff is not positive (the block offset would
# never advance). Returns nil.
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)

  raise ArgumentError, ":cutoff must be greater than zero" unless length > 0
  # With nothing to print the termination flag below is never set, so bail
  # out instead of writing blank lines forever.
  return if sequences.empty?

  loop do
    fin = false
    sequences.zip(labels) do |string, label|
      # Stop as soon as a sequence has no residues left at this offset.
      fin = (start >= string.length)
      break if fin
      io.puts "#{label.exactly_chars(chars)} : #{string[start, length]}"
    end
    io.puts " "
    break if fin
    start += length
  end
end
|
35
|
+
|
36
|
+
class String

  # Position-by-position comparison of self against +template+ over the
  # first template.size positions.
  #
  # Returns three Float percentages:
  #   [% identical characters (of template.size),
  #    % identical letters (of the letters in template),
  #    % identical letters (of the letters in self)]
  #
  # A "letter" is anything matching /[A-Za-z]/, so gap characters count
  # towards the first figure only. A ratio whose denominator is zero is
  # reported as 0.0 rather than NaN.
  def percent_similar_to(template)
    num_same = 0
    num_same_letters = 0
    num_letters_in_template = 0
    num_letters_in_self = 0
    (0...(template.size)).each do |i|
      # =~ yields an index (truthy, even when 0) or nil
      letters = (self[i, 1] =~ /[A-Za-z]/)
      num_letters_in_self += 1 if letters
      num_letters_in_template += 1 if template[i, 1] =~ /[A-Za-z]/
      next unless self[i] == template[i]
      num_same += 1
      num_same_letters += 1 if letters
    end
    [
      [num_same, template.size],
      [num_same_letters, num_letters_in_template],
      [num_same_letters, num_letters_in_self]
    ].map {|a, b| b.zero? ? 0.0 : (a.to_f / b) * 100 }
  end

  # Returns self as exactly +n+ characters: shorter strings are padded with
  # spaces on the left, longer strings are cut to their first +n+ characters.
  def exactly_chars(n)
    at_least = "%#{n}s" % self
    at_least[0, n]
  end

end
|
67
|
+
|
68
|
+
# Reads every entry of the flat file +file+ (format detected by
# Bio::FlatFile.auto).
#
# Returns [na_seq_objs, definitions]: two parallel Arrays holding a
# Bio::Sequence::NA built from each entry's sequence text and that entry's
# definition.
def seqs_and_defs(file)
  na_seq_objs = []
  definitions = []
  # Block form so the underlying file handle is closed once reading is done.
  Bio::FlatFile.auto(file) do |ff|
    ff.each_entry do |entry|
      definitions << entry.definition
      na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
    end
  end
  [na_seq_objs, definitions]
end
|
78
|
+
|
79
|
+
# Defaults; :min is the percent-identity threshold a sequence must exceed
# to be kept for the multiple alignment.
opt = {
  :min => 80,
}

# $VERBOSE doubles as this script's own verbosity switch (read by
# printv/putsv); -q turns it off.
$VERBOSE = 3
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <many_seqs>.fasta <one_template>.fasta"
  op.separator "output: <many_seqs>__<one_template>.<Threshold>.aligned"
  op.separator " "
  op.separator "description: goes through and does pairwise matching between all sequences and template,"
  op.separator "then does a multiple alignment on all those with the template."
  op.separator " "
  op.on("-t", "--threshold-pct <#{opt[:min]}>", Float, "minimum % match of given sequence") {|v| opt[:min] = v }
  op.on("-q", "--quiet", "don't give any info while running") {|v| $VERBOSE = false }
end
opts.parse!


if ARGV.size != 2
  puts opts
  exit
end

(all_fasta_file, template) = ARGV

(file_base, template_base) = ARGV.map do |file|
  File.basename(file, ".*")
end

outfile = [[file_base, template_base].join("__"), opt[:min], 'aligned'].join('.')


(seqs, definitions) = seqs_and_defs(all_fasta_file)

# Only the first entry of the template file is used.
(template_seq, template_def) = seqs_and_defs(template).map(&:first)
abort "no sequence found in template file: #{template}" if template_seq.nil?

#seqs = seqs[184,10]
#definitions = definitions[184,10]

#PSIM = %w(%chars_to_template %letters_to_template %letters_to_self)

factory = Bio::ClustalW.new

pass_threshold = []

printv "performing pairwise alignments [on #{seqs.size} seqs]: "
seqs.zip(definitions) do |seq, df|
  # Skip reads that are empty or consist solely of N
  if seq.to_s !~ /[^N]/i
    printv '- '
    next
  end
  align = Bio::Alignment.new([template_seq, seq])
  result = align.do_align(factory)
  # The block parameter has its own name so it can neither shadow nor
  # (on Ruby 1.8) overwrite the outer +seq+ that is stored below.
  (template_s, seq_s) = result.map do |aligned|
    aligned.to_s
  end
  psimilar = seq_s.percent_similar_to(template_s)
  printv( ("%.0f" % psimilar.last) + ' ')
  if psimilar.last > opt[:min]
    pass_threshold << [df, seq]
    #printv '*'
  else
    #printv '.'
  end
end
putsv "Done!"

abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"

# The template joins the multiple alignment as the last sequence.
pass_threshold << [template_def, template_seq]

multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
m_result = multi_align.do_align(factory).strip

labels = pass_threshold.map {|pair| pair.first }
aligned_seqs = m_result.map {|aligned| aligned.to_s }

File.open(outfile, 'w') do |out|
  print_align(out, aligned_seqs, labels)
end
|
160
|
+
|
@@ -0,0 +1,143 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
# Verbose-only +print+: writes +args+ to standard output without a trailing
# newline and flushes immediately so progress markers show up as they happen.
# Does nothing (and returns nil) when $VERBOSE is false or nil.
def printv(*args)
  return unless $VERBOSE

  print(*args)
  $stdout.flush
end
|
11
|
+
|
12
|
+
# Verbose-only +puts+: writes +args+ (newline terminated) to standard output
# and flushes immediately. Does nothing (and returns nil) when $VERBOSE is
# false or nil.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
|
17
|
+
|
18
|
+
|
19
|
+
# Writes aligned sequences to +io+ in fixed-width blocks, one labelled row per
# sequence, with a " " line after every block.
#
# io        - object responding to +puts+ (File, StringIO, ...)
# sequences - Array of Strings (the aligned sequences)
# labels    - Array of Strings, parallel to +sequences+; each label is
#             formatted with String#exactly_chars
# opts      - :cutoff => residues per block (default 100)
#             :start  => offset of the first residue printed (default 0)
#             :chars  => width of the label column (default 20)
#
# Raises ArgumentError when :cutoff is not positive (the block offset would
# never advance). Returns nil.
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)

  raise ArgumentError, ":cutoff must be greater than zero" unless length > 0
  # With nothing to print the termination flag below is never set, so bail
  # out instead of writing blank lines forever.
  return if sequences.empty?

  loop do
    fin = false
    sequences.zip(labels) do |string, label|
      # Stop as soon as a sequence has no residues left at this offset.
      fin = (start >= string.length)
      break if fin
      io.puts "#{label.exactly_chars(chars)} : #{string[start, length]}"
    end
    io.puts " "
    break if fin
    start += length
  end
end
|
35
|
+
|
36
|
+
class String

  # Position-by-position comparison of self against +template+ over the
  # first template.size positions.
  #
  # Returns three Float percentages:
  #   [% identical characters (of template.size),
  #    % identical letters (of the letters in template),
  #    % identical letters (of the letters in self)]
  #
  # A "letter" is anything matching /[A-Za-z]/, so gap characters count
  # towards the first figure only. A ratio whose denominator is zero is
  # reported as 0.0 rather than NaN.
  def percent_similar_to(template)
    num_same = 0
    num_same_letters = 0
    num_letters_in_template = 0
    num_letters_in_self = 0
    (0...(template.size)).each do |i|
      # =~ yields an index (truthy, even when 0) or nil
      letters = (self[i, 1] =~ /[A-Za-z]/)
      num_letters_in_self += 1 if letters
      num_letters_in_template += 1 if template[i, 1] =~ /[A-Za-z]/
      next unless self[i] == template[i]
      num_same += 1
      num_same_letters += 1 if letters
    end
    [
      [num_same, template.size],
      [num_same_letters, num_letters_in_template],
      [num_same_letters, num_letters_in_self]
    ].map {|a, b| b.zero? ? 0.0 : (a.to_f / b) * 100 }
  end

  # Returns self as exactly +n+ characters: shorter strings are padded with
  # spaces on the left, longer strings are cut to their first +n+ characters.
  def exactly_chars(n)
    at_least = "%#{n}s" % self
    at_least[0, n]
  end

end
|
67
|
+
|
68
|
+
# Reads every entry of the flat file +file+ (format detected by
# Bio::FlatFile.auto).
#
# Returns [na_seq_objs, definitions]: two parallel Arrays holding a
# Bio::Sequence::NA built from each entry's sequence text and that entry's
# definition.
def seqs_and_defs(file)
  na_seq_objs = []
  definitions = []
  # Block form so the underlying file handle is closed once reading is done.
  Bio::FlatFile.auto(file) do |ff|
    ff.each_entry do |entry|
      definitions << entry.definition
      na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
    end
  end
  [na_seq_objs, definitions]
end
|
78
|
+
|
79
|
+
outfile = "aligned.txt"

# Percent-identity threshold a sequence must exceed to be kept.
# NOTE(review): the script read opt[:min] without ever defining +opt+; 80 is
# assumed here because it is the default used by align_all.rb -- confirm.
opt = {
  :min => 80,
}

# $VERBOSE doubles as this script's own verbosity switch (read by
# printv/putsv).
$VERBOSE = 3
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} template.fasta <many_seqs>.fasta"
  op.separator "output: aligned.txt"
  # NOTE(review): the '#'-stripping advertised on the next line is not
  # implemented anywhere in this script.
  op.separator "if template.ANNOTATED.fasta, then strips leading '#' lines and writes template.fasta"
  op.separator " "
end
opts.parse!


if ARGV.size != 2
  puts opts
  exit
end

template = ARGV.shift
# NOTE(review): two arguments were always required but only the first was
# consumed; the second is assumed to be the FASTA file of sequences to align
# against the template -- confirm.
all_fasta_file = ARGV.shift

(seqs, definitions) = seqs_and_defs(all_fasta_file)

# Only the first entry of the template file is used.
(template_seq, template_def) = seqs_and_defs(template).map(&:first)
abort "no sequence found in template file: #{template}" if template_seq.nil?

#seqs = seqs[184,10]
#definitions = definitions[184,10]

#PSIM = %w(%chars_to_template %letters_to_template %letters_to_self)

factory = Bio::ClustalW.new

pass_threshold = []

printv "performing pairwise alignments [on #{seqs.size} seqs]: "
seqs.zip(definitions) do |seq, df|
  # Skip reads that are empty or consist solely of N
  if seq.to_s !~ /[^N]/i
    printv '- '
    next
  end
  align = Bio::Alignment.new([template_seq, seq])
  result = align.do_align(factory)
  # The block parameter has its own name so it can neither shadow nor
  # (on Ruby 1.8) overwrite the outer +seq+ that is stored below.
  (template_s, seq_s) = result.map do |aligned|
    aligned.to_s
  end
  psimilar = seq_s.percent_similar_to(template_s)
  printv( ("%.0f" % psimilar.last) + ' ')
  if psimilar.last > opt[:min]
    pass_threshold << [df, seq]
    #printv '*'
  else
    #printv '.'
  end
end
putsv "Done!"

abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"

# The template joins the multiple alignment as the last sequence.
pass_threshold << [template_def, template_seq]

multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
m_result = multi_align.do_align(factory).strip

labels = pass_threshold.map {|pair| pair.first }
aligned_seqs = m_result.map {|aligned| aligned.to_s }

File.open(outfile, 'w') do |out|
  print_align(out, aligned_seqs, labels)
end
|
143
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# http://align.genome.jp/clustalw/clustalw_help.html
|
2
|
+
|
3
|
+
>>HELP 8 << Help for command line parameters
|
4
|
+
|
5
|
+
DATA (sequences)
|
6
|
+
|
7
|
+
-INFILE=file.ext :input sequences.
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
VERBS (do things)
|
12
|
+
|
13
|
+
-OPTIONS :list the command line parameters
|
14
|
+
-HELP or -CHECK :outline the command line params.
|
15
|
+
-ALIGN :do full multiple alignment
|
16
|
+
-TREE :calculate NJ tree.
|
17
|
+
-BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000).
|
18
|
+
-CONVERT :output the input sequences in a different file format.
|
19
|
+
|
20
|
+
|
21
|
+
PARAMETERS (set things)
|
22
|
+
|
23
|
+
***General settings:****
|
24
|
+
-INTERACTIVE :read command line, then enter normal interactive menus
|
25
|
+
-QUICKTREE :use FAST algorithm for the alignment guide tree
|
26
|
+
-TYPE= :PROTEIN or DNA sequences
|
27
|
+
-NEGATIVE :protein alignment with negative values in matrix
|
28
|
+
-OUTFILE= :sequence alignment file name
|
29
|
+
-OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
|
30
|
+
-OUTORDER= :INPUT or ALIGNED
|
31
|
+
-CASE :LOWER or UPPER (for GDE output only)
|
32
|
+
-SEQNOS= :OFF or ON (for Clustal output only)
|
33
|
+
-SEQNO_RANGE=:OFF or ON (NEW: for all output formats)
|
34
|
+
-RANGE=m,n :sequence range to write starting m to m+n.
|
35
|
+
|
36
|
+
***Fast Pairwise Alignments:***
|
37
|
+
-KTUPLE=n :word size
|
38
|
+
-TOPDIAGS=n :number of best diags.
|
39
|
+
-WINDOW=n :window around best diags.
|
40
|
+
-PAIRGAP=n :gap penalty
|
41
|
+
-SCORE :PERCENT or ABSOLUTE
|
42
|
+
|
43
|
+
|
44
|
+
***Slow Pairwise Alignments:***
|
45
|
+
-PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
|
46
|
+
-PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
|
47
|
+
-PWGAPOPEN=f :gap opening penalty
|
48
|
+
-PWGAPEXT=f :gap extension penalty
|
49
|
+
|
50
|
+
|
51
|
+
***Multiple Alignments:***
|
52
|
+
-NEWTREE= :file for new guide tree
|
53
|
+
-USETREE= :file for old guide tree
|
54
|
+
-MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
|
55
|
+
-DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
|
56
|
+
-GAPOPEN=f :gap opening penalty
|
57
|
+
-GAPEXT=f :gap extension penalty
|
58
|
+
-ENDGAPS :no end gap separation pen.
|
59
|
+
-GAPDIST=n :gap separation pen. range
|
60
|
+
-NOPGAP :residue-specific gaps off
|
61
|
+
-NOHGAP :hydrophilic gaps off
|
62
|
+
-HGAPRESIDUES= :list hydrophilic res.
|
63
|
+
-MAXDIV=n :% ident. for delay
|
64
|
+
-TYPE= :PROTEIN or DNA
|
65
|
+
-TRANSWEIGHT=f :transitions weighting
|
66
|
+
|
67
|
+
|
68
|
+
***Trees:***
|
69
|
+
-OUTPUTTREE=nj OR phylip OR dist OR nexus
|
70
|
+
-SEED=n :seed number for bootstraps.
|
71
|
+
-KIMURA :use Kimura's correction.
|
72
|
+
-TOSSGAPS :ignore positions with gaps.
|
73
|
+
-BOOTLABELS=node OR branch :position of bootstrap values in tree display
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Concatenates the given FASTA files into a single file, dropping every
# line that starts with '#'.
outfile = "ANALYZE.FASTA"

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <file>.fasta ..."
  puts "comments (starting with '#') are ok"
  puts "outputs: #{outfile}"
  exit
end

all_text = ARGV.map do |file|
  # File.read, not IO.read: IO.read treats a name beginning with '|' as a
  # command to run, which must never happen for a command-line argument.
  File.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
end.join("\n")

File.open(outfile, 'w') do |out|
  out.print all_text
end
|
19
|
+
|
data/spec/align_spec.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
3
|
+
require 'bio/alignment/dna_sequence'
|
4
|
+
|
5
|
+
DNAReads = Bio::Alignment::DNASequenceReads
|
6
|
+
|
7
|
+
# Specs for the class-level helpers of Bio::Alignment::DNASequenceReads
# (aliased as DNAReads). `is`, `enums` and `xit` are not defined in this
# file -- presumably assertion/pending helpers loaded through spec_helper;
# verify there.
describe 'aligning' do

  # Fixtures shared by every example.
  before do
    @string = 'AAAATTTTGGGGGCCCCCC'
    # A read with gap/ambiguity characters at both ends
    @conc = '--A?A-AT?TTGGGGGCCCAAC?C---'
    @testcase = "testcase.fasta"

    # Pairwise alignments, each written as [template, read]
    @pa = [ ["--ABCDEFGHIJKLMNOP",
             "-----DEFGHIJK-MN--"],
            ["--ABCDEFGHIJKLM-NOP",
             "--ABCDE---IJKLMZNOP"],
            ["--ABCDEFGHIJKLMNOP",
             "-------------LMNOP"],
            ["--ABCDEFGHIJKLMNOP",
             "--ABCDEFGHIJKLMN--"],
            ["--ABCDEFGHIJKLMNOP",
             "--ABC------JKLM--P"],
            ["--ABC--DEFGHIJKLMNOP",
             "--ABCZZDEFGHIJKLMNOP"],
    ]
    # Expected result of merging @pa: one template plus one row per read
    @template = "--ABC--DEFGHIJKLM-NO"
    @aligned = ["-------DEFGHIJK-M-N-",
                "--ABC--DE---IJKLMZNO",
                "---------------LM-NO",
                "--ABC--DEFGHIJKLM-N-",
                "--ABC--------JKLM---",
                "--ABCZZDEFGHIJKLM-NO"
    ]

    @labels = %w(one two three four five six)

  end

  it 'removes bad ends' do
    # find_good_section returns a (start, length) pair used to slice @conc
    (start, len) = DNAReads.find_good_section(@conc, 4)
    @conc[start, len].is "TTGGGGGCCCAAC"
  end

  it 'aligns pairwise' do
    (template, others) = DNAReads.merge_pairwise(@pa)
    template.is @template
    @aligned.enums others
  end

  it 'can create a good consensus string' do
    (string, stats) = DNAReads.consensus_string_and_stats([@template, *@aligned])
    # NOTE(review): this literal is 19 characters while the template is 20 and
    # the stats below sum to 20 -- a run of spaces looks collapsed in this
    # copy of the file; confirm against the original source.
    string.is " ===^^==========^=="
    stats.enums [2, 3, 15, 0, 0, 0]
    (string, stats) = DNAReads.consensus_string_and_stats([@template, "-------DEFGHIJK-M-N-"])
    # NOTE(review): 18 characters here versus stats summing to 20 -- same
    # suspected whitespace collapse; confirm against the original source.
    string.is " ... ========.= =."
    stats.enums [5, 0, 10, 5, 0, 0]
  end

  # Visual check only: prints the formatted alignment; the trailing
  # `1.is 1` is a placeholder assertion.
  xit 'prints useful printout' do
    st = StringIO.new
    DNAReads.print_align(st, @aligned, @labels, :template => @template, :template_label => "template", :chars => 8)
    puts " "
    puts st.string
    1.is 1
  end
end
|