RubyGems - dna_sequence_aligner - Versions diffs - 0.0.2 - Mend

dna_sequence_aligner 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

data/.gitignore +8 -0
data/History +10 -0
data/LICENSE +22 -0
data/README.rdoc +58 -0
data/Rakefile +45 -0
data/VERSION +1 -0
data/bin/dna_sequence_aligner +61 -0
data/bin/dna_translator.rb +59 -0
data/lib/bio/alignment/dna_sequence.rb +313 -0
data/older/align_all.rb +160 -0
data/older/align_to_template.rb +143 -0
data/reference/clustalw_opts.txt +73 -0
data/script/fasta_compile_annotated.rb +19 -0
data/spec/align_spec.rb +67 -0
data/spec/spec_helper.rb +6 -0
data/spec/testfiles/HA-mKSR1-KSRF1.txt +19 -0
data/spec/testfiles/HA-mKSR1-KSRF2.txt +20 -0
data/spec/testfiles/HA-mKSR1-KSRF3.txt +20 -0
data/spec/testfiles/HA-mKSR1-KSRF4.txt +20 -0
data/spec/testfiles/HA-mKSR1-KSRF5.txt +20 -0
data/spec/testfiles/HA-mKSR1-OXL33.txt +20 -0
data/spec/testfiles/KSR1_mouse_NM_013571_in_HA_pREX.ANNOTATED.fasta +77 -0
data/spec/testfiles/testcase.fasta +55 -0
metadata +99 -0

data/.gitignore ADDED

@@ -0,0 +1,8 @@
+.DS_Store
+pkg/
+rdoc/
+backup/
+config/
+data/
+*.swp
+*.gemspec

data/History ADDED

@@ -0,0 +1,10 @@
+== 0.0.2 / 2010-02-12
+* standardized output, used it to align 8 different sequences with roughly 40
+total reads.
+== 0.0.1 / 2010-02-11
+* Beginning

data/LICENSE ADDED

@@ -0,0 +1,22 @@
+The MIT License
+Copyright (c) 2010 Howard Hughes Medical Institute
+Authored by John T. Prince
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,58 @@
+= DNASequenceAligner
+dna_sequence_aligner assumes you have a template DNA sequence.  All other DNA
+sequences are matched up with the template and then they are all merged into
+one template-centric alignment.  The output was custom designed to show
+coverage at a glance in a template-centric fashion.
+The software is also written so that you can annotate your template fasta file
+with comments (must lead with a '#' character).
+== Dependencies
+*Clustalw* must be installed (*clustalw* package in ubuntu/debian) and generally
+accessible.
+BioRuby is heavily relied on, but it is declared as a gem dependency, so you
+should not have to worry about it if you install dna_sequence_aligner by gem.
+== Examples
+The executable is the main item of interest.  It takes one (or more) sequence
+files.  Your template should be the first fasta encountered.
+    dna_sequence_aligner template.fasta others.fasta > output.aligned.txt
+    # sequences in separate files
+    dna_sequence_aligner template.fasta other1.fasta other2.fasta > output.aligned.txt
+    # all sequences in one file (template first)
+    dna_sequence_aligner all_seqs.fasta > output.aligned.txt
+A comment-aware (#) DNA sequence translator is provided so you can check
+whether a sequence is in the expected reading frame.  It outputs the DNA
+sequence with the protein sequence below it.
+    # -s 2 is a 2 nucleotide frameshift
+    dna_translator.rb -s 2 dna_annotated.fasta > protein.txt
+== Legend
+    all gaps            <blank>
+    template gap        ^
+    gap below template  .
+    agreement           =
+    all bad matches     ^
+    non-consensus       ?
+== NOTE
+This is very much alpha software at the moment.  It was written in a time
+crunch and so it is a little rough around the edges.  However, key components
+have specs written and appear to work properly.  If I have to do more
+alignments or you send me pull requests then this may get to be nicer
+software.
+== Copyright
+See LICENSE

data/Rakefile ADDED

@@ -0,0 +1,45 @@
# Build, spec and rdoc tasks for the dna_sequence_aligner gem.
require 'rubygems'
require 'rake'
require 'jeweler'
require 'rake/testtask'
# require 'rcov/rcovtask'

NAME = "dna_sequence_aligner"
WEBSITE_BASE = "website"
WEBSITE_OUTPUT = WEBSITE_BASE + "/output"

# Gem metadata handed to Jeweler, which generates the packaging tasks.
gemspec = Gem::Specification.new do |s|
  s.name = NAME
  s.authors = ["John T. Prince"]
  s.email = "jtprince@gmail.com"
  s.homepage = "http://jtprince.github.com/" + NAME
  s.summary = "does high pairwise alignment of sequencing reads with a template"
  s.description = "does high pairwise alignment of sequencing reads with a template using bioruby and clustalw.  gives template-centric output."
  s.add_dependency("bio")
  s.add_development_dependency("spec-more")
end

Jeweler::Tasks.new(gemspec)

# `rake spec` runs every spec/**/*_spec.rb file with lib/ and spec/ on the load path.
Rake::TestTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.verbose = true
end

require 'rake/rdoctask'
# rdoc is written into a per-version directory under the website output.
Rake::RDocTask.new do |rdoc|
  base_rdoc_output_dir = WEBSITE_OUTPUT + '/rdoc'
  # Strip surrounding whitespace (e.g. a trailing newline in VERSION) so it
  # does not end up inside the directory name or the title.
  version = File.read('VERSION').strip
  rdoc.rdoc_dir = base_rdoc_output_dir + "/#{version}"
  rdoc.title = NAME + ' ' + version
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

task :default => :spec

task :build => :gemspec

# credit: Rakefile modeled after Jeweler's

data/VERSION ADDED

	@@ -0,0 +1 @@
1	+ 0.0.2

data/bin/dna_sequence_aligner ADDED

@@ -0,0 +1,61 @@
#!/usr/bin/ruby
# Command line front end: reads a template plus one or more reads from fasta
# files, aligns every read to the template pairwise, merges the pairwise
# alignments and prints the template-centric result to STDOUT.

require 'stringio'
require 'bio'
require 'optparse'
require 'bio/alignment/dna_sequence'

# returns an Array of entry objects.  The method will remove any commented
# lines (lines starting with '#') before handing the text to Bio::FlatFile.
def fasta_entries(file)
  clean_string = IO.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
  io = StringIO.new(clean_string)
  objects = []
  Bio::FlatFile.auto(io) do |ff|
    ff.each_entry do |entry|
      objects << entry
    end
  end
  objects
end

DNAReads = Bio::Alignment::DNASequenceReads

# start from a private copy of the library defaults; the switches below
# overwrite individual entries
opt = DNAReads::ALIGN_OPTS.dup

op = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} <template>.fasta <read>.txt ..."
  parser.separator " "
  parser.on("--fidelity-length <#{opt[:fidelity_length]}>", Integer, "min length of correct reads on the ends") {|v| opt[:fidelity_length] = v }
  parser.separator " "
  parser.on("--type <DNA|PROTEIN>", "type of bio sequences") {|v| opt[:type] = v }
  parser.on("--gapopen <#{opt[:gapopen]}>", Float, "gap opening penalty") {|v| opt[:gapopen] = v }
  parser.on("--gapext <#{opt[:gapext]}>", Float, "gap extension penalty") {|v| opt[:gapext] = v }
  parser.on("--dnamatrix <String>", "DNA weight matrix IUB|CLUSTALW") {|v| opt[:dnamatrix] = v }
  parser.separator " "
  parser.on("-n", "--no-consensus", "use original seq for fidelity (not cons.)") {|v| opt[:consensus_fidelity] = false }
  parser.separator " "
  parser.separator "the first sequence is assumed the template sequence"
end

# Consume the switches so that they take effect and only file names are left
# in ARGV (without this every switch would be treated as an input file).
op.parse!

if ARGV.size == 0
  puts op
  exit
end

# all entries from all files, in command line order; the first is the template
entries = ARGV.inject([]) {|ar, file| ar.push( *fasta_entries(file) ) }

bioseqs = entries.map {|entry| Bio::Sequence::NA.new(entry.seq) }
labels = entries.map {|entry| entry.definition }

pairwise = DNAReads.align_pairwise(bioseqs, opt)
(template, others) = DNAReads.merge_pairwise(pairwise)

# the first label belongs to the template; the rest belong to the reads
template_label = labels.shift
DNAReads.print_align(STDOUT, others, labels, :template => template, :template_label => template_label, :chars => 30)

data/bin/dna_translator.rb ADDED

@@ -0,0 +1,59 @@
#!/usr/bin/ruby
# Prints each DNA sequence of a (possibly '#'-annotated) fasta file in fixed
# width lines, with the translated protein sequence underneath: each residue
# is printed below the first nucleotide of its codon.

require 'stringio'
require 'bio'
require 'optparse'

opt = {
  :frameshift => 0
}

op = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} <dnaseq>.fasta"
  parser.on("-s", "--shift <int>", Integer, "frameshift") {|v| opt[:frameshift] = v }
end
op.parse!

# number of leading nucleotides dropped from every sequence before translating
frameshift = opt[:frameshift]

if ARGV.size == 0
  puts op
  exit
end

file = ARGV.shift

# drop comment lines (those starting with '#') before parsing
string = IO.read(file).split("\n").reject {|line| line =~ /^\#/ }.join("\n")
ff = Bio::FlatFile.auto(StringIO.new(string))

seqs = []
ff.each_entry do |entry|
  seqs << entry.seq[frameshift..-1]
end

# nucleotides printed per output line
length = 70

seqs.each do |seq|
  protseq = Bio::Sequence::NA.new(seq).translate
  start = 0
  while start < seq.length
    puts seq[start, length]
    # one output character per nucleotide column of this line: the residue at
    # codon starts (every third position, counted from the shifted start),
    # a blank everywhere else
    prot_line = (start...(start + length)).map do |x|
      if x % 3 == 0
        protseq[x / 3, 1]
      else
        " "
      end
    end.join
    puts prot_line
    start += length
  end
  print "NUM START/STOP CODONS: "
  # counts the '*' characters in the translation
  puts protseq.to_s.split("").select {|v| v == '*' }.size
end

data/lib/bio/alignment/dna_sequence.rb ADDED

@@ -0,0 +1,313 @@
module Bio
  module Alignment
    # Helpers for aligning sequencing reads to a template with clustalw,
    # merging the resulting pairwise alignments into one template-centric
    # alignment, and printing that alignment with a consensus line.
    #
    # All methods are module functions, e.g.
    # Bio::Alignment::DNASequenceReads.merge_pairwise(aligns).
    module DNASequenceReads
      module_function

      # names of the options that are handed on to clustalw
      CLUSTALW_OPTS = %w(gapopen gapext dnamatrix type)

      # default options (see align_pairwise)
      ALIGN_OPTS = {
        :type => 'DNA',
        :gapopen => 20,
        :gapext => 20,
        :dnamatrix => 'IUB',  # "IUB" || "CLUSTALW"
        :fidelity_length => 10,
        :consensus_fidelity => true,
      }

      # returns the index at which the first run of at least min_length
      # consecutive good chars starts (a good char is anything but '?', '-',
      # 'N' or 'n').  Returns nil if the string contains no such run.
      def find_start_good_section(iupac_concensus_string, min_length)
        run_length = 0
        index = 0
        iupac_concensus_string.each_char do |char|
          if char =~ /[^\?\-Nn]/
            run_length += 1
            # index is the last char of the run; step back to its first char
            return index - (run_length - 1) if run_length >= min_length
          else
            run_length = 0
          end
          index += 1
        end
        nil
      end

      # returns [start, length] of the section that begins with the first run
      # of min_length good chars and ends with the last such run.  Returns nil
      # if no run of min_length good chars exists.
      def find_good_section(iupac_concensus_string, min_length)
        start = find_start_good_section(iupac_concensus_string, min_length)
        return nil if start.nil?
        # the same search on the reversed string gives the number of chars to
        # drop from the end
        from_end = find_start_good_section(iupac_concensus_string.reverse, min_length)
        return nil if from_end.nil?
        length = iupac_concensus_string.length - start - from_end
        length < 0 ? nil : [start, length]
      end

      # returns an array of "-key=value" strings for every key of the hash
      # that is listed in CLUSTALW_OPTS (all other keys are ignored)
      def hash_opts_to_clustalopts(hash)
        array = []
        hash.each do |key, value|
          array << "-#{key}=#{value}" if CLUSTALW_OPTS.include?(key.to_s)
        end
        array
      end

      # returns the string without its leading dashes
      def lstrip_dash(string)
        string[first_non_dash_char(string)..-1]
      end

      # returns the string without leading and trailing dashes
      def strip_dash(string)
        lstrip_dash(lstrip_dash(string).reverse).reverse
      end

      # returns the index of the first char that is not a dash (the length of
      # the string if it consists of dashes only)
      def first_non_dash_char(string)
        count = 0
        string.each_char do |char|
          break if char != '-'
          count += 1
        end
        count
      end

      # aligns the sequences with the given factory (e.g. a Bio::ClustalW
      # object) and returns whatever do_align returns
      def clustal_align(bioseqs, factory)
        al = Bio::Alignment.new(bioseqs)
        al.do_align(factory)
      end

      # returns high quality pairwise alignments
      # based on the fidelity_length option
      #
      # The first sequence is the template; every other sequence is aligned
      # to it on its own.  The ends of each read are then trimmed to the
      # section found by find_good_section (judged on the alignment consensus
      # if opt[:consensus_fidelity] is set, otherwise on the aligned read) and
      # the trimmed read is aligned again.  A read without such a section
      # keeps its first alignment and a warning is issued.
      #
      # bioseqs is not modified.  A missing :fidelity_length falls back to
      # the value in ALIGN_OPTS.
      def align_pairwise(bioseqs, opt={})
        factory = Bio::ClustalW.new
        factory.options = hash_opts_to_clustalopts(opt)
        min_length = opt[:fidelity_length] || ALIGN_OPTS[:fidelity_length]
        template = bioseqs.first
        reads = bioseqs[1..-1] || []
        reads.map do |bseq|
          clust_al = clustal_align([template, bseq], factory)
          aligned_string = clust_al[1].to_s
          seq_to_use = opt[:consensus_fidelity] ? clust_al.consensus : aligned_string
          (st, len) = find_good_section(seq_to_use, min_length)
          if st
            pristine = aligned_string[st, len].gsub('-','')  # pristine read (ends removed)
            clustal_align([template.to_s, Bio::Sequence::NA.new(pristine)], factory)
          else
            warn "a sequence does not meet min fidelity! using original alignment"
            clust_al
          end
        end
      end

      # assumes all were aligned to the same template (the first of a pair)
      #
      # aligns is an array of alignments; each alignment must respond to each
      # and yield the aligned template first and the aligned read second.
      # Returns [template_string, array_of_read_strings], all of equal length.
      #
      # Wherever a pairwise template has a gap, a gap column is put into the
      # merged template; reads whose own template has no gap there get a dash
      # in that column.  Works on one-character strings, so it behaves the
      # same whether String#[] returns integers or strings.
      def merge_pairwise(aligns)
        pairs = aligns.map do |align|
          seqs = []
          align.each {|bioseq| seqs << bioseq.to_s }
          seqs
        end
        return ['', []] if pairs.empty?

        indices = (0...pairs.size).to_a
        cursors = indices.map { 0 }  # position reached in each pairwise alignment
        template = []
        others = indices.map { [] }
        column = 0
        # run until every pairwise alignment is used up (the merged alignment
        # can be longer than the longest pairwise alignment)
        until indices.all? {|m| cursors[m] >= pairs[m][0].length }
          (gapped, ungapped) = indices.partition do |m|
            pairs[m][0][cursors[m], 1] == '-'
          end
          if gapped.empty?
            # no dashes in the template: every pair moves on by one
            template[column] = pairs[0][0][cursors[0], 1]
            indices.each do |m|
              others[m][column] = pairs[m][1][cursors[m], 1]
              cursors[m] += 1
            end
          else
            # if a template has a dash, all other off-templates need a dash
            template[column] = '-'
            # don't update these guys' counters
            ungapped.each {|m| others[m][column] = '-' }
            gapped.each do |m|
              others[m][column] = pairs[m][1][cursors[m], 1]
              cursors[m] += 1
            end
          end
          column += 1
        end
        [cs_to_s(template), others.map {|ar| cs_to_s(ar) }]
      end

      # joins an array of characters into a string.  Elements may be
      # character codes or strings; nil and "" become a dash.
      def cs_to_s(ar)
        ar.map do |v|
          if v.nil? || v == ''
            '-'
          elsif v.is_a?(Integer)
            v.chr
          else
            v.to_s
          end
        end.join
      end

      # assumes the first is the template
      #
      # returns [consensus_string, stats].  The consensus string holds one
      # symbol per template column (see the legend at print_align); stats
      # counts the columns per category in this order: all gaps, template
      # gap, agreement, gap below template, all bad matches, non-consensus.
      def consensus_string_and_stats(strings)
        stats = Array.new(6, 0)
        return ['', stats] if strings.empty?
        symbols = [' '] + %w(^ = . ^ ?)
        rows = strings.map {|v| v.split("") }
        template_row = rows.first
        read_rows = rows[1..-1]
        consensus_string = template_row.zip(*read_rows).map do |column|
          first = column.first
          rest = column[1..-1]
          category =
            if column.all? {|v| v.nil? or (v == '-') }
              0  # all gaps
            elsif first == '-'
              1  # template gap
            elsif rest.all? {|v| v == '-' }
              3  # gap below template
            elsif rest.all? {|v| (v == '-') or (v == first) }
              2  # agreement
            elsif rest.all? {|v| (v == '-') or (v != first) }
              4  # all bad matches
            else
              5  # non-consensus
            end
          stats[category] += 1
          symbols[category]
        end.join
        [consensus_string, stats]
      end

      # right-justifies the string to n chars and cuts off anything beyond
      # the first n chars
      def exactly_chars(string, n)
        at_least = "%#{n}s" % string
        at_least[0,n]
      end

      #     all gaps                  <blank>
      #     template gap              ^
      #     gap below template        .
      #     agreement                 =
      #     all bad matches           ^
      #     non-consensus             ?
      #
      # accepts :template => template_sequence
      #
      # Writes the sequences to io in blocks of :cutoff columns (default 70),
      # beginning at column :start (default 0).  Every block has a counter
      # line, one line per sequence that has a non-dash char in the block, a
      # "<CONSENSUS>" line and a separator line.  Labels are fit to :chars
      # characters (default 20).  With :template (and :template_label) the
      # template is shown as the first sequence.  Printing stops at the end
      # of the first sequence that is used up.  Neither sequences nor labels
      # is modified.
      def print_align(io, sequences, labels, opts={})
        opts = {:cutoff => 70, :start => 0, :chars => 20}.merge(opts)
        (start, length, chars) = opts.values_at(:start, :cutoff, :chars)
        spacer = "  "
        if opts[:template]
          # build new arrays so the caller's arrays are left alone
          sequences = [opts[:template]] + sequences
          labels = [opts[:template_label]] + labels
        end
        return if sequences.empty?
        loop do
          break if sequences.any? {|string| start >= string.length }
          doubles = sequences.map {|string| string[start, length] }.zip(labels)
          doubles = doubles.select {|frag, _| (frag.size > 0) && (frag =~ /[^-]/) }
          if doubles.empty?
            # nothing but gaps in this block
            start += length
            next
          end
          max_length = doubles.map {|frag, _| frag.size }.max
          (cs, _stats) = consensus_string_and_stats( doubles.map {|frag, _| frag } )
          doubles.push( [cs, "<CONSENSUS>"] )
          lines = doubles.map {|frag, label| [exactly_chars(label, chars), spacer, frag].join }
          ## the counters at the top of the line
          start_s = start.to_s
          # the end position only enters the width of the padding
          finish_s = (start + max_length).to_s
          count_line_gap = max_length - (start_s.size + finish_s.size)
          count_line = [start_s, spacer]
          count_line << " " * count_line_gap unless count_line_gap < 1
          io.puts [exactly_chars("", chars), spacer, count_line.join].join
          io.puts lines.join("\n")
          io.puts " "  # separator between lines
          start += length
        end
      end
    end
  end
end