dna_sequence_aligner 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
# Writes +args+ to standard output without appending a newline and flushes
# right away, so progress markers show up while work is still running.
# Does nothing (and returns nil) unless the global $VERBOSE flag is truthy.
def printv(*args)
  return unless $VERBOSE

  print(*args)
  $stdout.flush
end
11
+
12
# Line-oriented counterpart of printv: writes +args+ with puts and flushes
# standard output, but only when the global $VERBOSE flag is truthy.
# Returns nil when nothing is written.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
17
+
18
+
19
# Writes aligned sequences to +io+ in fixed-width chunks, one labelled line
# per sequence and a separator line (" ") after every chunk.
#
# io        - any object responding to puts
# sequences - array of strings (expected to be equally long alignment rows)
# labels    - array of labels, parallel to +sequences+; a missing label is
#             printed as blanks
# opts      - :cutoff => characters per chunk (default 100, must be > 0)
#             :start  => offset of the first chunk (default 0)
#             :chars  => width of the label column (default 20)
#
# Labels are right-justified to exactly :chars characters (longer labels are
# truncated).  Printing stops as soon as any sequence has no characters left
# at the current offset; a separator line is still written for that final
# pass, so the output ends with two separator lines.
#
# Raises ArgumentError when :cutoff is not positive (the offset would never
# advance).  Returns nil.
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
  raise ArgumentError, ":cutoff must be a positive integer" unless length.to_i > 0
  # Without any sequence the end condition below could never become true.
  return if sequences.empty?

  loop do
    fin = false
    sequences.zip(labels) do |string, label|
      fin = (start >= string.length)
      break if fin
      padded_label = label.to_s.rjust(chars)[0, chars]
      io.puts "#{padded_label} : #{string[start, length]}"
    end
    io.puts " "
    break if fin
    start += length
  end
end
35
+
36
class String

  # Compares self with +template+ position by position over the first
  # template.size positions (characters of self beyond that are ignored).
  #
  # Returns three percentages as floats:
  #   [% identical positions      (relative to template.size),
  #    % identical letter positions (relative to letters in template),
  #    % identical letter positions (relative to letters in self)]
  #
  # A "letter" is any character matching /[A-Za-z]/, so gap characters such
  # as '-' count towards the first figure only.  A percentage whose
  # denominator is zero is reported as 0.0 rather than NaN.
  def percent_similar_to(template)
    num_same = 0
    num_same_letters = 0
    num_letters_in_template = 0
    num_letters_in_self = 0
    template.size.times do |i|
      mine = self[i, 1]       # nil or "" once self is exhausted
      theirs = template[i, 1]
      mine_is_letter = (mine =~ /[A-Za-z]/)
      num_letters_in_self += 1 if mine_is_letter
      num_letters_in_template += 1 if theirs =~ /[A-Za-z]/
      next unless mine == theirs
      num_same += 1
      num_same_letters += 1 if mine_is_letter
    end
    percent = lambda {|count, total| total.zero? ? 0.0 : (count.to_f / total) * 100 }
    [
      percent.call(num_same, template.size),
      percent.call(num_same_letters, num_letters_in_template),
      percent.call(num_same_letters, num_letters_in_self)
    ]
  end

  # Returns self as a string of exactly +n+ characters: shorter strings are
  # right-justified (padded with spaces on the left), longer strings are cut
  # off after the first +n+ characters.
  def exactly_chars(n)
    rjust(n)[0, n]
  end

end
67
+
68
# Reads every entry of the sequence file +file+ (format detected by
# Bio::FlatFile.auto) and returns two parallel arrays:
#   [Bio::Sequence::NA objects, definition lines]
#
# The block form of Bio::FlatFile.auto is used so the underlying file is
# closed once all entries have been read.
def seqs_and_defs(file)
  na_seq_objs = []
  definitions = []
  Bio::FlatFile.auto(file) do |ff|
    ff.each_entry do |entry|
      definitions << entry.definition
      na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
    end
  end
  [na_seq_objs, definitions]
end
78
+
79
# Run-time settings; :min is the minimum percent identity a sequence must
# exceed to be kept (overridable with -t).
opt = { :min => 80 }

# Progress output (printv/putsv) is on by default; -q switches it off.
$VERBOSE = 3

opts = OptionParser.new
opts.banner = "usage: #{File.basename(__FILE__)} <many_seqs>.fasta <one_template>.fasta"
opts.separator "output: <many_seqs>__<one_template>.<Threshold>.aligned"
opts.separator " "
opts.separator "description: goes through and does pairwise matching between all sequences and template,"
opts.separator "then does a multiple alignment on all those with the template."
opts.separator " "
opts.on("-t", "--threshold-pct <#{opt[:min]}>", Float, "minimum % match of given sequence") do |pct|
  opt[:min] = pct
end
opts.on("-q", "--quiet", "don't give any info while running") do |_flag|
  $VERBOSE = false
end
# parse! removes the recognised switches from ARGV, leaving the file names.
opts.parse!

# Exactly two file names must remain; otherwise show the usage text and stop.
unless ARGV.size == 2
  puts opts
  exit
end
101
+
102
# Positional arguments: the FASTA file with the candidate sequences, then
# the FASTA file holding the template.
(all_fasta_file, template) = ARGV

# Output name: <many_seqs>__<one_template>.<threshold>.aligned
(file_base, template_base) = ARGV.map do |file|
  File.basename(file, ".*")
end

outfile = [[file_base, template_base].join("__"), opt[:min], 'aligned'].join('.')

(seqs, definitions) = seqs_and_defs(all_fasta_file)

# Only the first entry of the template file is used.
(template_seq, template_def) = seqs_and_defs(template).map(&:first)
abort "no sequence found in template file: #{template}" if template_seq.nil?

factory = Bio::ClustalW.new

# [definition, sequence] pairs whose similarity to the template exceeds opt[:min]
pass_threshold = []

printv "performing pairwise alignments [on #{seqs.size} seqs]: "
seqs.zip(definitions) do |seq, df|
  # Skip sequences that are empty or consist solely of N's.
  if seq.to_s !~ /[^N]/i
    printv '- '
    next
  end
  align = Bio::Alignment.new([template_seq, seq])
  result = align.do_align(factory)
  # The block parameter is deliberately not called `seq`, so the outer `seq`
  # pushed onto pass_threshold below is never shadowed or overwritten.
  (template_s, seq_s) = result.map {|aligned| aligned.to_s }
  psimilar = seq_s.percent_similar_to(template_s)
  # The last figure is the % of identical letters relative to the letters
  # of the aligned candidate sequence.
  printv( ("%.0f" % psimilar.last) + ' ')
  pass_threshold << [df, seq] if psimilar.last > opt[:min]
end
putsv "Done!"

abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"

# The template takes part in the multiple alignment as the last row.
pass_threshold << [template_def, template_seq]

multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
# NOTE(review): `strip` presumably trims surrounding whitespace from the
# aligned rows; confirm against the BioRuby alignment API.
m_result = multi_align.do_align(factory).strip

labels = pass_threshold.map {|pair| pair.first }
aligned_seqs = m_result.map {|aligned| aligned.to_s }

File.open(outfile, 'w') do |out|
  print_align(out, aligned_seqs, labels)
end
160
+
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'bio'
4
+ require 'optparse'
5
+
6
# Progress helper: emits +args+ on standard output (no newline added) and
# flushes immediately.  Silent, returning nil, when $VERBOSE is nil or false.
def printv(*args)
  return nil unless $VERBOSE

  $stdout.print(*args)
  $stdout.flush
end
11
+
12
# Progress helper: emits +args+ on standard output, one per line, and
# flushes immediately.  Silent, returning nil, when $VERBOSE is nil or false.
def putsv(*args)
  return nil unless $VERBOSE

  $stdout.puts(*args)
  $stdout.flush
end
17
+
18
+
19
# Writes aligned sequences to +io+ in fixed-width chunks, one labelled line
# per sequence and a separator line (" ") after every chunk.
#
# io        - any object responding to puts
# sequences - array of strings (expected to be equally long alignment rows)
# labels    - array of labels, parallel to +sequences+; a missing label is
#             printed as blanks
# opts      - :cutoff => characters per chunk (default 100, must be > 0)
#             :start  => offset of the first chunk (default 0)
#             :chars  => width of the label column (default 20)
#
# Labels are right-justified to exactly :chars characters (longer labels are
# truncated).  Printing stops as soon as any sequence has no characters left
# at the current offset; a separator line is still written for that final
# pass, so the output ends with two separator lines.
#
# Raises ArgumentError when :cutoff is not positive (the offset would never
# advance).  Returns nil.
def print_align(io, sequences, labels, opts={})
  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
  raise ArgumentError, ":cutoff must be a positive integer" unless length.to_i > 0
  # Without any sequence the end condition below could never become true.
  return if sequences.empty?

  loop do
    fin = false
    sequences.zip(labels) do |string, label|
      fin = (start >= string.length)
      break if fin
      padded_label = label.to_s.rjust(chars)[0, chars]
      io.puts "#{padded_label} : #{string[start, length]}"
    end
    io.puts " "
    break if fin
    start += length
  end
end
35
+
36
class String

  # Compares self with +template+ position by position over the first
  # template.size positions (characters of self beyond that are ignored).
  #
  # Returns three percentages as floats:
  #   [% identical positions      (relative to template.size),
  #    % identical letter positions (relative to letters in template),
  #    % identical letter positions (relative to letters in self)]
  #
  # A "letter" is any character matching /[A-Za-z]/, so gap characters such
  # as '-' count towards the first figure only.  A percentage whose
  # denominator is zero is reported as 0.0 rather than NaN.
  def percent_similar_to(template)
    num_same = 0
    num_same_letters = 0
    num_letters_in_template = 0
    num_letters_in_self = 0
    template.size.times do |i|
      mine = self[i, 1]       # nil or "" once self is exhausted
      theirs = template[i, 1]
      mine_is_letter = (mine =~ /[A-Za-z]/)
      num_letters_in_self += 1 if mine_is_letter
      num_letters_in_template += 1 if theirs =~ /[A-Za-z]/
      next unless mine == theirs
      num_same += 1
      num_same_letters += 1 if mine_is_letter
    end
    percent = lambda {|count, total| total.zero? ? 0.0 : (count.to_f / total) * 100 }
    [
      percent.call(num_same, template.size),
      percent.call(num_same_letters, num_letters_in_template),
      percent.call(num_same_letters, num_letters_in_self)
    ]
  end

  # Returns self as a string of exactly +n+ characters: shorter strings are
  # right-justified (padded with spaces on the left), longer strings are cut
  # off after the first +n+ characters.
  def exactly_chars(n)
    rjust(n)[0, n]
  end

end
67
+
68
# Reads every entry of the sequence file +file+ (format detected by
# Bio::FlatFile.auto) and returns two parallel arrays:
#   [Bio::Sequence::NA objects, definition lines]
#
# The block form of Bio::FlatFile.auto is used so the underlying file is
# closed once all entries have been read.
def seqs_and_defs(file)
  na_seq_objs = []
  definitions = []
  Bio::FlatFile.auto(file) do |ff|
    ff.each_entry do |entry|
      definitions << entry.definition
      na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
    end
  end
  [na_seq_objs, definitions]
end
78
+
79
outfile = "aligned.txt"

# Minimum percent identity a sequence must exceed to be kept.
# NOTE(review): this script exposes no threshold switch; 80 is an assumed
# default -- confirm the intended value.
opt = { :min => 80 }

$VERBOSE = 3
opts = OptionParser.new do |op|
  # NOTE(review): the argument check below requires two file names and the
  # template is taken first, so the second one is treated as the FASTA file
  # with the sequences to align -- confirm that this is the intended input.
  op.banner = "usage: #{File.basename(__FILE__)} template.fasta <many_seqs>.fasta"
  op.separator "output: aligned.txt"
  # NOTE(review): nothing in this script strips '#' lines; the next usage
  # line describes behaviour that is not implemented here.
  op.separator "if template.ANNOTATED.fasta, then strips leading '#' lines and writes template.fasta"
  op.separator " "
end
opts.parse!

if ARGV.size != 2
  puts opts
  exit
end

template = ARGV.shift
all_fasta_file = ARGV.shift

(seqs, definitions) = seqs_and_defs(all_fasta_file)

# Only the first entry of the template file is used.
(template_seq, template_def) = seqs_and_defs(template).map(&:first)
abort "no sequence found in template file: #{template}" if template_seq.nil?

factory = Bio::ClustalW.new

# [definition, sequence] pairs whose similarity to the template exceeds opt[:min]
pass_threshold = []

printv "performing pairwise alignments [on #{seqs.size} seqs]: "
seqs.zip(definitions) do |seq, df|
  # Skip sequences that are empty or consist solely of N's.
  if seq.to_s !~ /[^N]/i
    printv '- '
    next
  end
  align = Bio::Alignment.new([template_seq, seq])
  result = align.do_align(factory)
  # The block parameter is deliberately not called `seq`, so the outer `seq`
  # pushed onto pass_threshold below is never shadowed or overwritten.
  (template_s, seq_s) = result.map {|aligned| aligned.to_s }
  psimilar = seq_s.percent_similar_to(template_s)
  printv( ("%.0f" % psimilar.last) + ' ')
  pass_threshold << [df, seq] if psimilar.last > opt[:min]
end
putsv "Done!"

abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"

# The template takes part in the multiple alignment as the last row.
pass_threshold << [template_def, template_seq]

multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
m_result = multi_align.do_align(factory).strip

labels = pass_threshold.map {|pair| pair.first }
aligned_seqs = m_result.map {|aligned| aligned.to_s }

File.open(outfile, 'w') do |out|
  print_align(out, aligned_seqs, labels)
end
143
+
@@ -0,0 +1,73 @@
1
+ # http://align.genome.jp/clustalw/clustalw_help.html
2
+
3
+ >>HELP 8 << Help for command line parameters
4
+
5
+ DATA (sequences)
6
+
7
+ -INFILE=file.ext :input sequences.
8
+
9
+
10
+
11
+ VERBS (do things)
12
+
13
+ -OPTIONS :list the command line parameters
14
+ -HELP or -CHECK :outline the command line params.
15
+ -ALIGN :do full multiple alignment
16
+ -TREE :calculate NJ tree.
17
+ -BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000).
18
+ -CONVERT :output the input sequences in a different file format.
19
+
20
+
21
+ PARAMETERS (set things)
22
+
23
+ ***General settings:****
24
+ -INTERACTIVE :read command line, then enter normal interactive menus
25
+ -QUICKTREE :use FAST algorithm for the alignment guide tree
26
+ -TYPE= :PROTEIN or DNA sequences
27
+ -NEGATIVE :protein alignment with negative values in matrix
28
+ -OUTFILE= :sequence alignment file name
29
+ -OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
30
+ -OUTORDER= :INPUT or ALIGNED
31
+ -CASE :LOWER or UPPER (for GDE output only)
32
+ -SEQNOS= :OFF or ON (for Clustal output only)
33
+ -SEQNO_RANGE=:OFF or ON (NEW: for all output formats)
34
+ -RANGE=m,n :sequence range to write starting m to m+n.
35
+
36
+ ***Fast Pairwise Alignments:***
37
+ -KTUPLE=n :word size
38
+ -TOPDIAGS=n :number of best diags.
39
+ -WINDOW=n :window around best diags.
40
+ -PAIRGAP=n :gap penalty
41
+ -SCORE :PERCENT or ABSOLUTE
42
+
43
+
44
+ ***Slow Pairwise Alignments:***
45
+ -PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
46
+ -PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
47
+ -PWGAPOPEN=f :gap opening penalty
48
+ -PWGAPEXT=f :gap extension penalty
49
+
50
+
51
+ ***Multiple Alignments:***
52
+ -NEWTREE= :file for new guide tree
53
+ -USETREE= :file for old guide tree
54
+ -MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
55
+ -DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
56
+ -GAPOPEN=f :gap opening penalty
57
+ -GAPEXT=f :gap extension penalty
58
+ -ENDGAPS :no end gap separation pen.
59
+ -GAPDIST=n :gap separation pen. range
60
+ -NOPGAP :residue-specific gaps off
61
+ -NOHGAP :hydrophilic gaps off
62
+ -HGAPRESIDUES= :list hydrophilic res.
63
+ -MAXDIV=n :% ident. for delay
64
+ -TYPE= :PROTEIN or DNA
65
+ -TRANSWEIGHT=f :transitions weighting
66
+
67
+
68
+ ***Trees:***
69
+ -OUTPUTTREE=nj OR phylip OR dist OR nexus
70
+ -SEED=n :seed number for bootstraps.
71
+ -KIMURA :use Kimura's correction.
72
+ -TOSSGAPS :ignore positions with gaps.
73
+ -BOOTLABELS=node OR branch :position of bootstrap values in tree display
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/ruby
2
+
3
# Concatenates the FASTA files named on the command line into a single
# file, dropping every line that starts with '#'.
outfile = "ANALYZE.FASTA"

if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} <file>.fasta ..."
  puts "comments (starting with '#') are ok"
  puts "outputs: #{outfile}"
  exit
end

all_text = ARGV.map do |file|
  # File.read (rather than IO.read) so that an argument beginning with '|'
  # is always treated as a file name and never run as a command.
  File.read(file).split("\n").reject {|line| line.start_with?('#') }.join("\n")
end.join("\n")

# The text is written as joined, i.e. without a trailing newline.
File.open(outfile, 'w') do |out|
  out.print all_text
end
19
+
@@ -0,0 +1,67 @@
1
+
2
+ require File.dirname(__FILE__) + '/spec_helper'
3
+ require 'bio/alignment/dna_sequence'
4
+
5
+ DNAReads = Bio::Alignment::DNASequenceReads
6
+
7
# Specs for the class aliased as DNAReads: trimming unreliable read ends,
# merging pairwise alignments onto one template, building a consensus line
# and (pending) printing an alignment.
describe 'aligning' do

  before do
    @string = 'AAAATTTTGGGGGCCCCCC'
    @conc = '--A?A-AT?TTGGGGGCCCAAC?C---'
    @testcase = "testcase.fasta"

    # Pairwise alignments, each given as a [template row, read row] pair.
    @pa = [
      ["--ABCDEFGHIJKLMNOP",   "-----DEFGHIJK-MN--"],
      ["--ABCDEFGHIJKLM-NOP",  "--ABCDE---IJKLMZNOP"],
      ["--ABCDEFGHIJKLMNOP",   "-------------LMNOP"],
      ["--ABCDEFGHIJKLMNOP",   "--ABCDEFGHIJKLMN--"],
      ["--ABCDEFGHIJKLMNOP",   "--ABC------JKLM--P"],
      ["--ABC--DEFGHIJKLMNOP", "--ABCZZDEFGHIJKLMNOP"]
    ]

    # Expected outcome of merging the pairs above: one template row plus
    # one row per read.
    @template = "--ABC--DEFGHIJKLM-NO"
    @aligned = [
      "-------DEFGHIJK-M-N-",
      "--ABC--DE---IJKLMZNO",
      "---------------LM-NO",
      "--ABC--DEFGHIJKLM-N-",
      "--ABC--------JKLM---",
      "--ABCZZDEFGHIJKLM-NO"
    ]

    @labels = ['one', 'two', 'three', 'four', 'five', 'six']
  end

  it 'removes bad ends' do
    offset, span = DNAReads.find_good_section(@conc, 4)
    @conc[offset, span].is "TTGGGGGCCCAAC"
  end

  it 'aligns pairwise' do
    merged_template, merged_reads = DNAReads.merge_pairwise(@pa)
    merged_template.is @template
    @aligned.enums merged_reads
  end

  it 'can create a good consensus string' do
    # All reads against the template.
    consensus, counts = DNAReads.consensus_string_and_stats([@template, *@aligned])
    consensus.is " ===^^==========^=="
    counts.enums [2, 3, 15, 0, 0, 0]

    # A single read against the template.
    consensus, counts = DNAReads.consensus_string_and_stats([@template, "-------DEFGHIJK-M-N-"])
    consensus.is " ... ========.= =."
    counts.enums [5, 0, 10, 5, 0, 0]
  end

  # Pending: only prints the rendered alignment for visual inspection.
  xit 'prints useful printout' do
    buffer = StringIO.new
    DNAReads.print_align(buffer, @aligned, @labels, :template => @template, :template_label => "template", :chars => 8)
    puts " "
    puts buffer.string
    1.is 1
  end
end