RubyGems - dna_sequence_aligner - Versions diffs - 0.0.2 - Mend

dna_sequence_aligner 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

data/.gitignore +8 -0
data/History +10 -0
data/LICENSE +22 -0
data/README.rdoc +58 -0
data/Rakefile +45 -0
data/VERSION +1 -0
data/bin/dna_sequence_aligner +61 -0
data/bin/dna_translator.rb +59 -0
data/lib/bio/alignment/dna_sequence.rb +313 -0
data/older/align_all.rb +160 -0
data/older/align_to_template.rb +143 -0
data/reference/clustalw_opts.txt +73 -0
data/script/fasta_compile_annotated.rb +19 -0
data/spec/align_spec.rb +67 -0
data/spec/spec_helper.rb +6 -0
data/spec/testfiles/HA-mKSR1-KSRF1.txt +19 -0
data/spec/testfiles/HA-mKSR1-KSRF2.txt +20 -0
data/spec/testfiles/HA-mKSR1-KSRF3.txt +20 -0
data/spec/testfiles/HA-mKSR1-KSRF4.txt +20 -0
data/spec/testfiles/HA-mKSR1-KSRF5.txt +20 -0
data/spec/testfiles/HA-mKSR1-OXL33.txt +20 -0
data/spec/testfiles/KSR1_mouse_NM_013571_in_HA_pREX.ANNOTATED.fasta +77 -0
data/spec/testfiles/testcase.fasta +55 -0
metadata +99 -0

data/older/align_all.rb ADDED

@@ -0,0 +1,160 @@
+#!/usr/bin/ruby
+require 'bio'
+require 'optparse'
+def printv(*args)
+  if $VERBOSE
+    print(*args) ; $stdout.flush
+  end
+end
+def putsv(*args)
+  if $VERBOSE
+    puts(*args) ; $stdout.flush
+  end
+end
+def print_align(io, sequences, labels, opts={})
+  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
+  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
+  loop do
+    fin = false
+    sequences.zip(labels) do |string, label|
+      fin = (start >= string.length )
+      break if fin
+      io.puts "#{label.exactly_chars(chars)} : #{string[start,length]}"
+    end
+    io.puts " "
+    break if fin
+    start += length
+  end
+end
+class String
+  # returns [% same chars, % same letters (template), % same letters self]
+  def percent_similar_to(template)
+    num_same = 0
+    num_same_letters = 0
+    num_letters_in_template = 0
+    num_letters_in_self = 0
+    (0...(template.size)).each do |i|
+      if letters = (self[i,1] =~ /[A-Za-z]/)
+        num_letters_in_self += 1
+      end
+      if template[i,1] =~ /[A-Za-z]/
+        num_letters_in_template += 1
+      end
+      if self[i] == template[i]
+        num_same += 1
+        if letters
+          num_same_letters += 1
+        end
+      end
+    end
+    [[num_same, template.size], [num_same_letters, num_letters_in_template], [num_same_letters, num_letters_in_self]].map {|a,b| (a.to_f / b) * 100 }
+  end
+  def exactly_chars(n)
+    at_least = "%#{n}s" % self
+    at_least[0,n]
+  end
+end
+def seqs_and_defs(file)
+  ff = Bio::FlatFile.auto(file)
+  na_seq_objs = []
+  definitions = []
+  ff.each_entry do |entry|
+    definitions << entry.definition
+    na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
+  end
+  [na_seq_objs, definitions]
+end
+# --- script body: pairwise-filter sequences against a template, then multi-align ---
+# :min is the percentage a sequence's similarity must exceed to be kept.
+opt = {
+  :min => 80,
+}
+# A truthy $VERBOSE enables the printv/putsv progress output; --quiet turns it off.
+$VERBOSE = 3
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} <many_seqs>.fasta <one_template>.fasta"
+  op.separator "output: <many_seqs>__<one_template>.<Threshold>.aligned"
+  op.separator " "
+  op.separator "description: goes through and does pairwise matching between all sequences and template,"
+  op.separator "then does a multiple alignment on all those with the template."
+  op.separator " "
+  op.on("-t", "--threshold-pct <#{opt[:min]}>", Float, "minimum % match of given sequence") {|v| opt[:min] = v }
+  op.on("-q", "--quiet", "don't give any info while running") {|v| $VERBOSE = false }
+end
+opts.parse!
+# Exactly two positional arguments are required; otherwise show usage and stop.
+if ARGV.size != 2
+  puts opts
+  exit
+end
+(all_fasta_file, template) = ARGV
+# Base names (directory and extension removed) are used to build the output file name.
+(file_base, template_base) = ARGV.map do |file|
+  File.basename(file, ".*")
+end
+# e.g. <many_seqs>__<one_template>.<threshold>.aligned, written to the current directory
+outfile = [[file_base, template_base].join("__"), opt[:min], 'aligned'].join('.')
+(seqs, definitions) = seqs_and_defs(all_fasta_file)
+# Only the first sequence and first definition of the template file are used.
+(template_seq, template_def) = seqs_and_defs(template).map(&:first)
+#seqs = seqs[184,10]
+#definitions = definitions[184,10]
+#PSIM = %w(%chars_to_template %letters_to_template %letters_to_self)
+factory = Bio::ClustalW.new
+# Collects [definition, sequence] pairs whose similarity exceeds the threshold.
+pass_threshold = []
+printv "performing pairwise alignments [on #{seqs.size} seqs]: "
+seqs.zip(definitions) do |seq, df|
+  # Skip sequences that contain nothing other than N/n (this also matches an empty sequence).
+  if seq.to_s !~ /[^N]/i
+    printv '- '
+    next
+  end
+  # Pairwise alignment of the template against this one sequence.
+  align = Bio::Alignment.new([template_seq, seq])
+  result = align.do_align(factory)
+  # NOTE(review): the block parameter reuses the name `seq`; on Ruby 1.8 block
+  # parameters are not block-local, so this would overwrite the outer `seq`
+  # that is pushed into pass_threshold below — confirm target Ruby version.
+  (template_s, seq_s) = result.map do |seq|
+    seq.to_s
+  end
+  psimilar = seq_s.percent_similar_to(template_s)
+  # The last element is the % of same letters relative to the letters in this sequence.
+  printv( ("%.0f" % psimilar.last) + ' ')
+  if psimilar.last > opt[:min]
+    pass_threshold << [df, seq]
+    #printv '*'
+  else
+    #printv '.'
+  end
+end
+putsv "Done!"
+abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
+putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"
+# The template goes last so it is part of the multiple alignment and its printout.
+pass_threshold << [template_def, template_seq]
+multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
+# NOTE(review): `strip` is presumably the BioRuby alignment method that trims
+# gap-only ends — confirm against the BioRuby version in use.
+m_result = multi_align.do_align(factory).strip
+labels = pass_threshold.map {|pair| pair.first }
+aligned_seqs = m_result.map {|seq| seq.to_s }
+File.open(outfile, 'w') do |out|
+  print_align(out, aligned_seqs, labels)
+end

data/older/align_to_template.rb ADDED

@@ -0,0 +1,143 @@
+#!/usr/bin/ruby
+require 'bio'
+require 'optparse'
+def printv(*args)
+  if $VERBOSE
+    print(*args) ; $stdout.flush
+  end
+end
+def putsv(*args)
+  if $VERBOSE
+    puts(*args) ; $stdout.flush
+  end
+end
+def print_align(io, sequences, labels, opts={})
+  opts = {:cutoff => 100, :start => 0, :chars => 20}.merge(opts)
+  (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
+  loop do
+    fin = false
+    sequences.zip(labels) do |string, label|
+      fin = (start >= string.length )
+      break if fin
+      io.puts "#{label.exactly_chars(chars)} : #{string[start,length]}"
+    end
+    io.puts " "
+    break if fin
+    start += length
+  end
+end
+class String
+  # returns [% same chars, % same letters (template), % same letters self]
+  def percent_similar_to(template)
+    num_same = 0
+    num_same_letters = 0
+    num_letters_in_template = 0
+    num_letters_in_self = 0
+    (0...(template.size)).each do |i|
+      if letters = (self[i,1] =~ /[A-Za-z]/)
+        num_letters_in_self += 1
+      end
+      if template[i,1] =~ /[A-Za-z]/
+        num_letters_in_template += 1
+      end
+      if self[i] == template[i]
+        num_same += 1
+        if letters
+          num_same_letters += 1
+        end
+      end
+    end
+    [[num_same, template.size], [num_same_letters, num_letters_in_template], [num_same_letters, num_letters_in_self]].map {|a,b| (a.to_f / b) * 100 }
+  end
+  def exactly_chars(n)
+    at_least = "%#{n}s" % self
+    at_least[0,n]
+  end
+end
+def seqs_and_defs(file)
+  ff = Bio::FlatFile.auto(file)
+  na_seq_objs = []
+  definitions = []
+  ff.each_entry do |entry|
+    definitions << entry.definition
+    na_seq_objs << Bio::Sequence::NA.new(entry.seq.to_s)
+  end
+  [na_seq_objs, definitions]
+end
+# --- script body: align sequences to a template and write aligned.txt ---
+# NOTE(review): this script looks unfinished. Below it reads `seqs`,
+# `definitions`, `template_seq`, `template_def` and `opt`, none of which is
+# assigned anywhere in the visible script — confirm before relying on it.
+outfile = "aligned.txt"
+# A truthy $VERBOSE enables the printv/putsv progress output.
+$VERBOSE = 3
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} template.fasta"
+  op.separator "output: aligned.txt"
+  op.separator "if template.ANNOTATED.fasta, then strips leading '#' lines and writes template.fasta"
+  op.separator " "
+end
+opts.parse!
+# NOTE(review): the banner shows a single argument, but two are required here.
+if ARGV.size != 2
+  puts opts
+  exit
+end
+# First positional argument; it is not referenced again in the visible script.
+template = ARGV.shift
+#seqs = seqs[184,10]
+#definitions = definitions[184,10]
+#PSIM = %w(%chars_to_template %letters_to_template %letters_to_self)
+factory = Bio::ClustalW.new
+# Collects [definition, sequence] pairs whose similarity exceeds the threshold.
+pass_threshold = []
+printv "performing pairwise alignments [on #{seqs.size} seqs]: "
+seqs.zip(definitions) do |seq, df|
+  # Skip sequences that contain nothing other than N/n (this also matches an empty sequence).
+  if seq.to_s !~ /[^N]/i
+    printv '- '
+    next
+  end
+  # Pairwise alignment of the template against this one sequence.
+  align = Bio::Alignment.new([template_seq, seq])
+  result = align.do_align(factory)
+  # NOTE(review): the block parameter reuses the name `seq`; on Ruby 1.8 block
+  # parameters are not block-local, so this would overwrite the outer `seq`
+  # that is pushed into pass_threshold below — confirm target Ruby version.
+  (template_s, seq_s) = result.map do |seq|
+    seq.to_s
+  end
+  psimilar = seq_s.percent_similar_to(template_s)
+  # The last element is the % of same letters relative to the letters in this sequence.
+  printv( ("%.0f" % psimilar.last) + ' ')
+  if psimilar.last > opt[:min]
+    pass_threshold << [df, seq]
+    #printv '*'
+  else
+    #printv '.'
+  end
+end
+putsv "Done!"
+abort "none found above threshold! #{opt[:min]}" if pass_threshold.size == 0
+putsv "Found #{pass_threshold.size} sequence(s) above #{opt[:min]}% identical"
+# The template goes last so it is part of the multiple alignment and its printout.
+pass_threshold << [template_def, template_seq]
+multi_align = Bio::Alignment.new( pass_threshold.map {|pair| pair.last } )
+m_result = multi_align.do_align(factory).strip
+labels = pass_threshold.map {|pair| pair.first }
+aligned_seqs = m_result.map {|seq| seq.to_s }
+File.open(outfile, 'w') do |out|
+  print_align(out, aligned_seqs, labels)
+end

data/reference/clustalw_opts.txt ADDED

@@ -0,0 +1,73 @@
+# http://align.genome.jp/clustalw/clustalw_help.html
+ >>HELP 8 <<      Help for command line parameters
+                DATA (sequences)
+-INFILE=file.ext                             :input sequences.
+                VERBS (do things)
+-OPTIONS	    :list the command line parameters
+-HELP  or -CHECK    :outline the command line params.
+-ALIGN              :do full multiple alignment
+-TREE               :calculate NJ tree.
+-BOOTSTRAP(=n)      :bootstrap a NJ tree (n= number of bootstraps; def. = 1000).
+-CONVERT            :output the input sequences in a different file format.
+                PARAMETERS (set things)
+***General settings:****
+-INTERACTIVE :read command line, then enter normal interactive menus
+-QUICKTREE   :use FAST algorithm for the alignment guide tree
+-TYPE=       :PROTEIN or DNA sequences
+-NEGATIVE    :protein alignment with negative values in matrix
+-OUTFILE=    :sequence alignment file name
+-OUTPUT=     :GCG, GDE, PHYLIP, PIR or NEXUS
+-OUTORDER=   :INPUT or ALIGNED
+-CASE        :LOWER or UPPER (for GDE output only)
+-SEQNOS=     :OFF or ON (for Clustal output only)
+-SEQNO_RANGE=:OFF or ON (NEW: for all output formats)
+-RANGE=m,n   :sequence range to write starting m to m+n.
+***Fast Pairwise Alignments:***
+-KTUPLE=n    :word size
+-TOPDIAGS=n  :number of best diags.
+-WINDOW=n    :window around best diags.
+-PAIRGAP=n   :gap penalty
+-SCORE       :PERCENT or ABSOLUTE
+***Slow Pairwise Alignments:***
+-PWMATRIX=    :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
+-PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
+-PWGAPOPEN=f  :gap opening penalty
+-PWGAPEXT=f   :gap extension penalty
+***Multiple Alignments:***
+-NEWTREE=      :file for new guide tree
+-USETREE=      :file for old guide tree
+-MATRIX=       :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
+-DNAMATRIX=    :DNA weight matrix=IUB, CLUSTALW or filename
+-GAPOPEN=f     :gap opening penalty
+-GAPEXT=f      :gap extension penalty
+-ENDGAPS       :no end gap separation pen.
+-GAPDIST=n     :gap separation pen. range
+-NOPGAP        :residue-specific gaps off
+-NOHGAP        :hydrophilic gaps off
+-HGAPRESIDUES= :list hydrophilic res.
+-MAXDIV=n      :% ident. for delay
+-TYPE=         :PROTEIN or DNA
+-TRANSWEIGHT=f :transitions weighting
+***Trees:***
+-OUTPUTTREE=nj OR phylip OR dist OR nexus
+-SEED=n        :seed number for bootstraps.
+-KIMURA        :use Kimura's correction.
+-TOSSGAPS      :ignore positions with gaps.
+-BOOTLABELS=node OR branch :position of bootstrap values in tree display

data/script/fasta_compile_annotated.rb ADDED

@@ -0,0 +1,19 @@
+#!/usr/bin/ruby
+outfile = "ANALYZE.FASTA"
+if ARGV.size == 0
+  puts "usage: #{File.basename(__FILE__)} <file>.fasta ..."
+  puts "comments (starting with '#') are ok"
+  puts "outputs: #{outfile}"
+  exit
+end
+all_text = ARGV.map do |file|
+  IO.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
+end.join("\n")
+File.open(outfile, 'w') do |out|
+  out.print all_text
+end

data/spec/align_spec.rb ADDED

@@ -0,0 +1,67 @@
+require File.dirname(__FILE__) + '/spec_helper'
+require 'bio/alignment/dna_sequence'
+# Short alias for the class under test.
+DNAReads = Bio::Alignment::DNASequenceReads
+# Specs for the DNASequenceReads class-level helpers.
+# NOTE(review): `is` and `enums` are assertion helpers not defined in this
+# file — presumably supplied through spec_helper; confirm.
+describe 'aligning' do
+  before do
+    @string = 'AAAATTTTGGGGGCCCCCC'
+    # Read with gap ('-') and '?' characters, used by the 'removes bad ends' spec.
+    @conc = '--A?A-AT?TTGGGGGCCCAAC?C---'
+    @testcase = "testcase.fasta"
+    # Pairwise alignments: each pair is [template row, read row].
+    @pa = [ ["--ABCDEFGHIJKLMNOP",
+             "-----DEFGHIJK-MN--"],
+            ["--ABCDEFGHIJKLM-NOP",
+             "--ABCDE---IJKLMZNOP"],
+            ["--ABCDEFGHIJKLMNOP",
+             "-------------LMNOP"],
+            ["--ABCDEFGHIJKLMNOP",
+             "--ABCDEFGHIJKLMN--"],
+            ["--ABCDEFGHIJKLMNOP",
+             "--ABC------JKLM--P"],
+            ["--ABC--DEFGHIJKLMNOP",
+              "--ABCZZDEFGHIJKLMNOP"],
+    ]
+    # Expected merged template and read rows for merge_pairwise(@pa).
+    @template = "--ABC--DEFGHIJKLM-NO"
+    @aligned = ["-------DEFGHIJK-M-N-",
+                "--ABC--DE---IJKLMZNO",
+                "---------------LM-NO",
+                "--ABC--DEFGHIJKLM-N-",
+                "--ABC--------JKLM---",
+                "--ABCZZDEFGHIJKLM-NO"
+                ]
+    @labels = %w(one two three four five six)
+  end
+  it 'removes bad ends' do
+    # find_good_section returns a [start, length] pair that is used to slice @conc.
+    (start, len) = DNAReads.find_good_section(@conc, 4)
+    @conc[start, len].is "TTGGGGGCCCAAC"
+  end
+  it 'aligns pairwise' do
+    (template, others) = DNAReads.merge_pairwise(@pa)
+    template.is @template
+    @aligned.enums others
+  end
+  it 'can create a good consensus string' do
+    # First row passed in is the template; the rest are the aligned reads.
+    (string, stats) = DNAReads.consensus_string_and_stats([@template, *@aligned])
+    string.is "  ===^^==========^=="
+    stats.enums [2, 3, 15, 0, 0, 0]
+    (string, stats) = DNAReads.consensus_string_and_stats([@template, "-------DEFGHIJK-M-N-"])
+    string.is "  ...  ========.= =."
+    stats.enums [5, 0, 10, 5, 0, 0]
+  end
+  # NOTE(review): `xit` presumably marks this spec as disabled; it only prints
+  # the alignment for visual inspection and asserts a tautology.
+  xit 'prints useful printout' do
+    st = StringIO.new
+    DNAReads.print_align(st, @aligned, @labels, :template => @template, :template_label => "template", :chars => 8)
+    puts " "
+    puts st.string
+    1.is 1
+  end
+end