RubyGems - transfuse - Versions diffs - 0.1.4 → 0.4.2 - Mend

transfuse 0.1.4 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +4 -4
data/README.md +42 -1
data/Rakefile +10 -4
data/bin/transfuse +12 -11
data/deps/deps.yaml +11 -0
data/lib/transfuse/cluster.rb +53 -68
data/lib/transfuse/cmd.rb +1 -1
data/lib/transfuse/consensus.rb +105 -0
data/lib/transfuse/transfuse.rb +108 -88
data/lib/transfuse/version.rb +2 -2
data/lib/transfuse.rb +5 -4
data/test/test_transfuse.rb +38 -40
data/transfuse.gemspec +1 -1
metadata +5 -5
data/Gemfile.lock +0 -87

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a5b17813bba553fa78a7e3cf24366981e368ebfe
-  data.tar.gz: f57a5011b3296862518ce54a7f3d4bb928e05aab
+  metadata.gz: 03b84ea2236b7c43ce321f64352bb6cb7b7035d4
+  data.tar.gz: ed70d9a9c204a06b550a7b95c01c74afdcb98b51
 SHA512:
-  metadata.gz: 52af21dbf0c5b49aa746d515a214229bccd78a55d5998ddc929b2594d8abe5bafcb9cd1bbc98ec6ea1a41a5cb3240666616837d3d06c5cbacd25acca3fe392ca
-  data.tar.gz: 147cc022f22ad319ff0651a6092ced5d3f1c7e99ab3989a8091970ab5c25a97e42d9b98f7384e28712ef802c4ced8090c3942ba66105963013eb30e46c3b58cb
+  metadata.gz: 6074f06d8afd9e33fce7b77e0221ffc8a300da966cc7916f86eb245d02c1e2b5519a831e1c99962fcfd5b085b5cd3d94765297ec94469270e4246c6c9ab71830
+  data.tar.gz: d992bac7cb1a98cd523e4df28d5b684f42bf8d8d7deb7882d42ecd8bb50bb137a190200006c7528f834d9e30a214b1f304a46b8502176993e22b019658c19061

data/README.md CHANGED Viewed

@@ -1 +1,42 @@
-Transfuse
+## Transfuse
+**Transfuse is currently in development and is not yet ready for use**
+Transfuse intelligently merges your multiple de novo transcriptome assemblies. Run multiple assemblies with different de novo assemblers, or with different settings in the same assembler, and have them combined into a single high-quality transcriptome.
+Transfuse takes in the reads you used to do the assembly and a list of fasta files and produces a single output fasta file.
+### Installation and Running
+To install Transfuse, clone this repo:
+`git clone https://github.com/cboursnell/transfuse.git`
+Then build and install the Ruby gem:
+`gem build *spec; gem install *gem`
+### Usage
+Transfuse is run on the command line. The options are:
+```
+  -a, --assembly=<s>    assembly files in FASTA format, comma-separated
+  -l, --left=<s>        left reads file in FASTQ format
+  -r, --right=<s>       right reads file in FASTQ format
+  -o, --output=<s>      write merged assembly to file
+  -t, --threads=<i>     number of threads (default: 1)
+  -v, --verbose         be verbose
+  -e, --version         Print version and exit
+  -h, --help            Show this message
+```
+An example command:
+`transfuse --assembly soap-k31.fa,soap-k41.fa,soap-k51.fa --left reads_1.fq --right reads_2.fq --output soap-merged.fa --threads 12`
+### License
+This is academic software - please cite us if you use it in your work.
+Transfuse is released under the MIT license.

data/Rakefile CHANGED Viewed

@@ -5,15 +5,21 @@ Rake::TestTask.new do |t|
 end
 Rake::TestTask.new do |t|
-  t.name = :corset
+  t.name = :cluster
   t.libs << 'test'
-  t.test_files = ['test/test_corset.rb']
+  t.test_files = ['test/test_cluster.rb']
 end
 Rake::TestTask.new do |t|
-  t.name = :cluster
+  t.name = :fuse
   t.libs << 'test'
-  t.test_files = ['test/test_cluster.rb']
+  t.test_files = ['test/test_transfuse.rb']
+end
+Rake::TestTask.new do |t|
+  t.name = :cons
+  t.libs << 'test'
+  t.test_files = ['test/test_consensus.rb']
 end
 desc "Run tests"

data/bin/transfuse CHANGED Viewed

@@ -22,23 +22,24 @@ opts = Trollop::options do
   OPTIONS:
   EOS
-  opt :assembly, "assembly files in FASTA format, comma-separated",
+  opt :assemblies, "assembly files in FASTA format, comma-separated",
       :type => String, :required => true
-  opt :scores, "transrate contig score output files, comma-separated",
-      :type => String
   opt :left, "left reads file in FASTQ format",
       :type => String
   opt :right, "right reads file in FASTQ format",
       :type => String
+  opt :scores, "transrate contig score output files, comma-separated. Ignored if reads are provided",
+      :type => String
   opt :output, "write merged assembly to file",
       :type => String, :required => :true
   opt :threads, "number of threads", :type => :int, :default => 1
+  opt :id, "sequence identity to cluster at", :type => :float, :default => 1.0
   opt :verbose, "be verbose"
 end
 transfuse = Transfuse::Transfuse.new opts.threads, opts.verbose
-assembly_files = transfuse.check_files opts.assembly
+assembly_files = transfuse.check_files opts.assemblies
 score_files = transfuse.check_files opts.score if opts.score
 left = transfuse.check_files opts.left if opts.left
 right = transfuse.check_files opts.right if opts.right
@@ -54,6 +55,7 @@ else
   abort msg
 end
+# filter out assemblies with low score
 assembly_files = transfuse.filter assembly_files, scores
 # concatenate assemblies into one fasta file
@@ -62,12 +64,11 @@ cat = transfuse.concatenate assembly_files
 # load fasta sequences from concatenated file into hash
 transfuse.load_fasta cat
-# cluster using vsearch or maybe cd-hit-est
-clusters = transfuse.cluster cat
-transfuse.sequence_alignment clusters
-# pull out contigs from each cluster based on the scores
-# best = transfuse.select_contigs clusters, scores
+# cluster using vsearch
+msa = transfuse.cluster cat, opts.id
-# transfuse.output_contigs best, cat, opts.output
+# read the msa from vsearch and produce a consensus fasta
+cons = transfuse.consensus msa, scores, opts.output
+# transrate the consensus output to remove low scoring contigs
+transfuse.transrate_consensus cons, opts.output, left, right

data/deps/deps.yaml CHANGED Viewed

@@ -0,0 +1,11 @@
+vsearch:
+  binaries:
+    - vsearch
+  version:
+    number: '1.1.3'
+    command: 'vsearch --version'
+  url:
+    64bit:
+      linux: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-linux-x86_64
+      macosx: https://github.com/torognes/vsearch/releases/download/v1.1.3/vsearch-1.1.3-osx-x86_64
+  unpack: false

data/lib/transfuse/cluster.rb CHANGED Viewed

@@ -5,61 +5,31 @@ module Transfuse
   class Cluster
-    def initialize threads, verbose
-      @cdhit = Which::which('cd-hit-est').first
-      raise "cd-hit-est was not in the PATH - please install it" unless @cdhit
+    def initialize threads, verbose, id
       @vsearch = Which::which('vsearch').first
       raise "vsearch was not in the PATH - please install it" unless @vsearch
-      @id = "1.00"
+      @id = id.to_s
       @threads = threads
       @verbose = verbose
     end
     def run fasta
-      use_cd_hit = false
-      if use_cd_hit
-        output = cd_hit fasta
-        return parse_output output
-      else
-        output = vsearch fasta
-        return parse_vsearch_output output
-      end
-    end
-    def cd_hit fasta
-      puts "running cd-hit-est" if @verbose
-      output = "#{File.basename(fasta, File.extname(fasta))}_cdhit.fa"
-      cdhit_cmd = generate_cdhit_command fasta, output
-      puts cdhit_cmd if @verbose
-      cluster = Cmd.new cdhit_cmd
-      cluster.run output
-      return "#{output}.clstr"
+      cluster_output, msa_output = vsearch fasta
+      return parse_vsearch_output(cluster_output, msa_output)
     end
     def vsearch fasta
-      puts "running vsearch" if @verbose
-      cluster_output = "#{fasta}.clust"
-      vsearch_cmd = generate_vsearch_command fasta, cluster_output
+      print "running vsearch..." if @verbose
+      cluster_output = "#{File.basename(fasta)}-#{@id}.clust"
+      msa_output = "#{File.basename(fasta)}-#{@id}.aln"
+      vsearch_cmd = generate_vsearch_command fasta, cluster_output, msa_output
       cluster = Cmd.new vsearch_cmd
       cluster.run cluster_output
-      return cluster_output
-    end
-    def generate_cdhit_command fasta, out
-      #cd-hit-est -i all.fa  -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
-      cmd = "#{@cdhit}"
-      cmd << " -i #{fasta}"
-      cmd << " -o #{out}"
-      cmd << " -c #{@id}" # similarity = number of identical bases /
-                          #              length of shorter sequences
-      cmd << " -T #{@threads}"
-      cmd << " -n 10" # word length - maybe increase??
-      cmd << " -d 100" # output name width
-      cmd << " -g 1" # slower but more accurate mode
-      cmd << " -M 8000" # increase memory
+      puts " Done. Created #{cluster_output}" if @verbose
+      return [cluster_output, msa_output]
     end
-    def generate_vsearch_command fasta, out
+    def generate_vsearch_command fasta, out, msa
       vsearch = "#{@vsearch}"
       vsearch << " --cluster_fast #{fasta}"
       vsearch << " --id #{@id}"
@@ -67,45 +37,60 @@ module Transfuse
       vsearch << " --qmask none" # no masking
       vsearch << " --strand both"
       vsearch << " --uc #{out}"
+      vsearch << " --msaout #{msa}"
       vsearch << " --threads #{@threads}"
       return vsearch
     end
-    def parse_output cluster_output
-      puts "parsing cd-hit output #{cluster_output}" if @verbose
-      cluster_id = 0
-      clusters = {}
-      File.open(cluster_output).each_line do |line|
-        if line =~ />Cluster\ ([0-9]+)/
-          cluster_id = $1.to_i
-        elsif line =~ /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
-          contig_name = $1
-          strand = $2
-          id = $3.to_f
-          clusters[cluster_id] ||= []
-          clusters[cluster_id] << { :name => contig_name, :strand => strand }
-        elsif line =~ /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
-          contig_name = $1
-          strand = "+"
-          clusters[cluster_id] ||= []
-          clusters[cluster_id] << { :name => contig_name, :strand => strand }
-        end
-      end
-      return clusters
-    end
-    def parse_vsearch_output cluster_output
+    def parse_vsearch_output cluster_output, msa_output
+      print "parsing vsearch output" if @verbose
       clusters = {}
+      lookup = {}
+      second = 0
+      count = 0
       File.open(cluster_output).each_line do |line|
+        count+=1
         if line.start_with?("S") or line.start_with?("H")
           cols = line.chomp.split("\t")
-          cluster = cols[1].to_i
+          cluster = cols[1]
+          len = cols[2].to_i
+          cigar = cols[7]
+          strand = cols[4]
+          strand = "+" if strand == "*"
           contig_name = cols[8]
           clusters[cluster] ||= []
-          clusters[cluster] << contig_name
+          clusters[cluster] << { :name => contig_name, :strand => strand }
+          lookup[contig_name] = cluster
+        end
+        if count%10_000==0 and @verbose
+          print "."
         end
       end
-      return clusters
+      puts " Done" if @verbose
+      print "parsing msa output    " if @verbose
+      count = 0
+      msa = {}
+      Bio::FastaFormat.open(msa_output).each do |entry|
+        count += 1
+        name = entry.entry_id
+        if name != "consensus"
+          # name = name[1..-1]
+          if name[0]=="*"
+            name = name[1..-1]
+          end
+          # what cluster is name in?
+          cluster = lookup[name]
+          msa[cluster] ||= []
+          msa[cluster] << { :name => name, :seq => entry.seq.seq }
+        end
+        if count%10_000==0 and @verbose
+          print "."
+        end
+      end
+      puts " Done" if @verbose
+      return msa
     end
   end

data/lib/transfuse/cmd.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module Transfuse
     def run file=nil
       unless file.nil?
-        if File.exist?(file)
+        if File.exist?(file) and File.stat(file).size > 0
           @stdout = ""
           @stderr = ""
           @status = Status.new

data/lib/transfuse/consensus.rb ADDED Viewed

@@ -0,0 +1,105 @@
+require 'bio'
+require 'set'
+module Transfuse
+  class Consensus
+    attr_reader :contigs
+    def initialize verbose
+      @verbose = verbose
+    end
+    def run msa, scores, output
+      return 1 if File.exist?(output)
+      print "writing consensus " if @verbose
+      # msa is a hash
+      #   key = cluster id
+      #   value = list
+      #     list of sequences in cluster aligned with gaps
+      preoutput = "#{File.basename(output, File.extname(output))}_cons.fa"
+      count = 0
+      File.open("#{output}.data", "w") do |out2|
+        File.open(preoutput, "w") do |out|
+          msa.each do |id, list|
+            count+=1
+            print "." if count%5_000==0 and @verbose
+            exons={}
+            cons = []
+            length = list[0][:seq].length
+              list.each_with_index do |hash, index|
+                seq = hash[:seq]
+                name = hash[:name]
+                out2.write "#{id}\t#{scores[name][:score]}\t#{name}\n"
+                prev = ""
+                gap = 0
+                exon = 0
+                seq.each_char do |c|
+                  if c=="-"
+                    base="-"
+                  else
+                    base="*"
+                  end
+                  if base!=prev
+                    if c=="-"
+                      gap+=1
+                    else
+                      exon+=1
+                    end
+                  end
+                  if c=="-"
+                    prev = "-"
+                  else
+                    prev = "*"
+                  end
+                end
+                exons[index] = exon
+              end
+            consensus = ""
+            0.upto(length-1) do |i|
+              base="N"
+              counts = {}
+              list.each_with_index do |hash, index|
+                seq = hash[:seq]
+                if seq[i] != "-" and seq[i] != "N"
+                  counts[seq[i]]||=0
+                  counts[seq[i]] += 1
+                  if exons[index]==1
+                    base = seq[i]
+                  end
+                end
+              end
+              if counts.size>0
+                base = counts.sort.last.first
+              end
+              consensus << base
+            end
+            if consensus.count("N") < consensus.length.to_f*0.5
+              cons << consensus
+            end
+            list.each_with_index do |hash, index|
+              if exons[index] > 1
+                cons << hash[:seq].delete("-")
+              end
+            end
+            cons.each_with_index do |s,index|
+              out.write ">contig#{id}.#{index+1}\n"
+              out.write "#{s}\n"
+            end
+          end # msa.each
+        end # file
+      end # file open
+      puts " Done" if @verbose
+      return preoutput
+    end # def
+  end
+end

data/lib/transfuse/transfuse.rb CHANGED Viewed

@@ -6,27 +6,28 @@ end
 module Transfuse
+  require 'bio'
   require 'csv'
   require 'transrate'
+  require 'threach'
   class Transfuse
     def initialize threads, verbose
       @threads = threads
       @verbose = verbose
-      @clustalo = Which::which('clustalo').first
-      raise "clustalo was not in the PATH - please install it" unless @clustalo
     end
     def check_files string
+      # puts "check file string: #{string}" if @verbose
       list = []
       string.split(",").each do |file|
         file = File.expand_path(file)
         if File.exist?(file)
-          puts "#{file} exists" if @verbose
+          puts "#{File.basename(file)} exists" if @verbose
           list << file
         else
-          abort "#{file} not found"
+          abort "#{File.basename(file)} not found"
         end
       end
       return list
@@ -51,42 +52,27 @@ module Transfuse
       return File.expand_path(catted_fasta)
     end
-    def cluster file
-      puts "clustering #{file}" if @verbose
-      cluster = Cluster.new @threads, @verbose
-      return cluster.run file
-    end
     def load_fasta fasta
+      print "loading fasta sequence #{fasta}..." if @verbose
       @sequences = {}
+      count = 1
       Bio::FastaFormat.open(fasta).each do |entry|
         @sequences[entry.entry_id] = entry.seq.to_s
+        print "." if count%10_000==0 and @verbose
+        count +=1
       end
+      puts " Done" if @verbose
     end
-    def sequence_alignment clusters
-      clusters.each do |id, list| # threach
-        if list.size > 5
-          seq = ""
-          list.each do |hash|
-            seq << ">#{hash[:name]}\n"
-            if hash[:strand] == "+"
-              seq << "#{@sequences[hash[:name]]}\n"
-            elsif hash[:strand] == "-"
-              seq << "#{@sequences[hash[:name]].revcomp}\n"
-            else
-              abort "Unknown strand #{hash[:strand]}"
-            end
-          end
-          cmd = "echo -e \"#{seq}\" | #{@clustalo} -i - --outfmt fa "
-          cmd << "--output-order tree-order"
-          align = Cmd.new cmd
-          align.run
-          File.open("cluster#{id}.fa", "wb") do |out|
-            out.write align.stdout
-          end
-        end
-      end
+    def cluster file, id
+      puts "clustering #{file}" if @verbose
+      cluster = Cluster.new @threads, @verbose, id
+      return cluster.run file
+    end
+    def consensus msa, scores, output
+      cons = Consensus.new(@verbose)
+      return cons.run(msa, scores, output)
     end
     def load_scores files
@@ -96,8 +82,10 @@ module Transfuse
                           :header_converters => :symbol,
                           :converters => :all) do |row|
           name = row[:contig_name]
-          score = row[:score]
-          scores[name] = score
+          scores[name] = { :score => row[:score].to_f,
+                           :p_good => row[:p_good].to_f,
+                           :p_bases_covered => row[:p_bases_covered].to_f,
+                           :coverage => row[:coverage].to_f }
         end
       end
       return scores
@@ -107,13 +95,15 @@ module Transfuse
       filtered_files = []
       files.each_with_index do |file, index|
         new_filename = "#{File.basename(file, File.extname(file))}_filtered.fa"
-        unless File.exist?(new_filename)
+        if !File.exist?(new_filename) or File.stat(new_filename).size < 1
           File.open(new_filename, "wb") do |out|
-            puts "opening #{file}..."
+            puts "filtering #{file}..." if @verbose
             Bio::FastaFormat.open(file).each do |entry|
               contig_name = entry.entry_id
               contig_name = "contig#{index}_#{contig_name}"
-              if scores.key?(contig_name) and scores[contig_name] > 0.01
+              if scores.key?(contig_name) and
+                 scores[contig_name][:score] > 0.01 and
+                 scores[contig_name][:coverage] >= 1
                 out.write ">#{contig_name}\n"
                 out.write "#{entry.seq}\n"
               elsif !scores.key?(contig_name)
@@ -127,75 +117,105 @@ module Transfuse
       return filtered_files
     end
+    def transrate_consensus file, output, left, right
+      output = File.expand_path(output)
+      puts "transrate on #{file}" if @verbose
+      file = File.expand_path(file)
+      name = File.basename(file, File.extname(file))
+      dir = "transrate_#{name}"
+      Dir.mkdir(dir) unless Dir.exist?(dir)
+      Dir.chdir(dir) do
+        assembly = Transrate::Assembly.new(file)
+        transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
+        rename = "assembly_#{name}_score_optimisation.csv"
+        rm = transrater.read_metrics(left.join(','), right.join(','))
+        stats = rm.read_stats
+        File.rename("assembly_score_optimisation.csv", rename)
+        scores={}
+        assembly.each do |name, contig|
+          scores[name] = { :score => contig.score.to_f,
+                           :p_good => contig.p_good.to_f,
+                           :p_bases_covered => contig.p_bases_covered.to_f,
+                           :coverage => contig.coverage.to_f }
+        end
+        scores_file = "#{name}_scores.csv"
+        stats_file = "../#{name}_stats.txt"
+        puts "  writing scores" if @verbose
+        File.open(scores_file, "wb") do |out|
+          scores.each do |name, hash|
+            out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
+            out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
+          end
+        end
+        puts "  writing filtered fasta file" if @verbose
+        File.open(output, "wb") do |out|
+          assembly.each do |name, contig|
+            if contig.score.to_f > 0.01 and contig.coverage.to_f >= 1
+              out.write ">#{name}\n"
+              out.write "#{contig.seq.seq}\n"
+            end
+          end
+        end
+        puts "  writing stats" if @verbose
+        File.open(stats_file, "wb") do |out|
+          stats.each do |key, value|
+            out.write "#{key}\t#{value}\n"
+          end
+          out.write "assembly score:\t#{transrater.assembly_score}\n"
+          optimal = transrater.assembly_optimal_score("prefix")
+          out.write "optimal score :\t#{optimal[0]}\n"
+          out.write "cutoff        :\t#{optimal[1]}\n"
+        end
+      end
+    end
     def transrate files, left, right
       scores = {}
-      scores_file = "scores.csv"
+      shortname = ""
+      files.each do |n|
+        shortname << File.basename(n, File.extname(n))[0..4]
+      end
+      scores_file = "#{shortname}_scores.csv"
       if File.exist?(scores_file)
         puts "loading scores from file" if @verbose
         File.open(scores_file).each do |line|
-          name, score = line.chomp.split("\t")
-          scores[name] = score.to_f
+          name, score, p_good, p_bases_covered, coverage = line.chomp.split("\t")
+          scores[name] = { :score => score.to_f,
+                           :p_good => p_good.to_f,
+                           :p_bases_covered => p_bases_covered.to_f,
+                           :coverage => coverage.to_f }
         end
       else
         files.each_with_index do |fasta, index|
           puts "transrate on #{fasta}" if @verbose
-          assembly = Transrate::Assembly.new(fasta)
-          transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
-          transrater.read_metrics(left.join(','), right.join(','))
-          assembly.each do |name, contig|
-            name = "contig#{index}_#{name}"
-            scores[name] = contig.score
+          dir = "transrate_#{File.basename(fasta, File.extname(fasta))}"
+          Dir.mkdir(dir) unless Dir.exist?(dir)
+          Dir.chdir(dir) do
+            assembly = Transrate::Assembly.new(fasta)
+            transrater = Transrate::Transrater.new(assembly, nil, threads:@threads)
+            rename = "assembly#{index}_score_optimisation.csv"
+            transrater.read_metrics(left.join(','), right.join(','))
+            File.rename("assembly_score_optimisation.csv", rename)
+            assembly.each do |name, contig|
+              name = "contig#{index}_#{name}"
+              scores[name] = { :score => contig.score.to_f,
+                               :p_good => contig.p_good.to_f,
+                               :p_bases_covered => contig.p_bases_covered.to_f,
+                               :coverage => contig.coverage.to_f }
+            end
           end
         end
         File.open(scores_file, "wb") do |out|
-          scores.each do |name, score|
-            out.write "#{name}\t#{score}\n"
+          scores.each do |name, hash|
+            out.write "#{name}\t#{hash[:score]}\t#{hash[:p_good]}\t"
+            out.write "#{hash[:p_bases_covered]}\t#{hash[:coverage]}\n"
           end
         end
       end
       return scores
     end
-    def select_contigs clusters, scores
-      puts "selecting contigs" if @verbose
-      best = []
-      clusters.each do |cluster_id, list|
-        best_score = 0
-        best_contig = ""
-        list.each do |contig_name|
-          unless scores[contig_name]
-            abort "can't find #{contig_name} in scores hash\n"
-          end
-          if scores[contig_name] > best_score
-            best_score = scores[contig_name]
-            best_contig = contig_name
-          end
-        end
-        best << best_contig
-      end
-      return best
-    end
-    def output_contigs best, fasta, output
-      puts "writing contigs" if @verbose
-      # read in catted fasta sequences
-      sequences = {}
-      Bio::FastaFormat.open(fasta).each do |entry|
-        sequences[entry.entry_id] = entry.seq
-      end
-      File.open(output, "wb") do |out|
-        best.each do |contig_name|
-          if sequences.key?(contig_name)
-            out.write ">#{contig_name}\n"
-            out.write "#{sequences[contig_name]}\n"
-          else
-            puts "can't find #{contig_name} in #{fasta}"
-          end
-        end
-      end
-    end
   end
 end

data/lib/transfuse/version.rb CHANGED Viewed

@@ -7,8 +7,8 @@ module Transfuse
   # Semantic Versioning 2.0 (http://semver.org/).
   module VERSION
     MAJOR = 0
-    MINOR = 1
-    PATCH = 4
+    MINOR = 4
+    PATCH = 2
     BUILD = nil
     STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')

data/lib/transfuse.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-require 'transfuse/cluster.rb'
-require 'transfuse/cmd.rb'
-require 'transfuse/transfuse.rb'
-require 'transfuse/version.rb'
+require 'transfuse/cluster'
+require 'transfuse/cmd'
+require 'transfuse/consensus'
+require 'transfuse/transfuse'
+require 'transfuse/version'

data/test/test_transfuse.rb CHANGED Viewed

@@ -8,13 +8,13 @@ class TestTransfuse < Test::Unit::TestCase
   context 'transfuse' do
     setup do
-      @fuser = Transfuse::Transfuse.new 4
+      @fuser = Transfuse::Transfuse.new 4, true
     end
     teardown do
     end
-    should 'check for existence of files' do
+    should '1 check for existence of files' do
       list = []
       list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
       list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
@@ -22,7 +22,7 @@ class TestTransfuse < Test::Unit::TestCase
       assert_equal 2, files.length, "length"
     end
-    should "concatenate two files" do
+    should "2 concatenate two files" do
       list = []
       list << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
       list << File.join(File.dirname(__FILE__), 'data', 'assembly2.fasta')
@@ -36,71 +36,69 @@ class TestTransfuse < Test::Unit::TestCase
       end
     end
-    should "cluster fasta file" do
-      Dir.mktmpdir do |tmpdir|
+    should "3 cluster fasta file" do
+      # Dir.mktmpdir do |tmpdir|
+      tmpdir = Dir.mktmpdir
         Dir.chdir(tmpdir) do
           file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
           hash = @fuser.cluster file
           assert_equal 250, hash.size, "output size"
         end
-      end
+      # end
     end
-    should "load scores from transrate output" do
+    should "4 load scores from transrate output" do
       files = []
       files << File.join(File.dirname(__FILE__), 'data', 'contig_scores1.csv')
       hash = @fuser.load_scores files
       assert_equal 99, hash.size
     end
-    should "filter contigs" do
+    should "5 run transrate on assembly files with reads" do
       files = []
-      scores = {}
-      files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
-      scores["soap_contig173359"] = 1
-      scores["soap_contig38533"] = 0.5
-      scores["idba_contig44716"] = 0
-      new_list = @fuser.filter files, scores
-      assert_equal 1, new_list.length
-      cmd = "grep -c \">\" #{new_list.first}"
-      assert_equal 2, `#{cmd}`.chomp.split.first.to_i, "contigs"
+      left = []
+      right = []
+      files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
+      left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
+      right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
+      # Dir.mktmpdir do |tmpdir|
+      tmpdir = Dir.mktmpdir
+        Dir.chdir(tmpdir) do
+          scores = @fuser.transrate files, left, right
+          assert_equal 100, scores.size, "scores size"
+        end
+      # end
     end
-    should "run transrate on assembly files with reads" do
+    should "6 filter contigs" do
       files = []
       left = []
       right = []
-      files << File.join(File.dirname(__FILE__), 'data', 'assembly3.fasta')
+      files << File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
       left << File.join(File.dirname(__FILE__), 'data', 'left.fq')
       right << File.join(File.dirname(__FILE__), 'data', 'right.fq')
-      Dir.mktmpdir do |tmpdir|
+      # Dir.mktmpdir do |tmpdir|
+      tmpdir = Dir.mktmpdir
         Dir.chdir(tmpdir) do
           scores = @fuser.transrate files, left, right
-          assert_equal 100, scores.size, "scores size"
+          scores.each do |contig, score|
+            # puts "#{contig}\t#{score}"
+          end
+          new_list = @fuser.filter files, scores
+          assert_equal 1, new_list.length
+          cmd = "grep -c \">\" #{new_list.first}"
+          assert_equal 1, `#{cmd}`.chomp.split.first.to_i, "number of contigs"
         end
-      end
+      # end
     end
-    should "select contigs" do
-      clusters = {"0" => ["contig1", "contig2"], "1" => ["contig3", "contig4"]}
-      scores = { "contig1" => 0.2,
-                 "contig2" => 0.3,
-                 "contig3" => 0.4,
-                 "contig4" => 0.2 }
-      best = @fuser.select_contigs clusters, scores
-      assert_equal 2, best.size
-      assert_equal "contig2", best[0]
-      assert_equal "contig3", best[1]
+    should "7 get consensus of clusters" do
     end
-    should "output contigs" do
-      best = ["soap_contig173359", "oases_contig80246"]
-      file = File.join(File.dirname(__FILE__), 'data', 'assembly1.fasta')
-      Dir.mktmpdir do |tmpdir|
-        Dir.chdir(tmpdir) do
-          @fuser.output_contigs best, file, "out"
-        end
-      end
+    should "8 not fail when there are duplicated kmers in the input sequences" do
     end
   end

data/transfuse.gemspec CHANGED Viewed

@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
   gem.add_dependency 'bio', '~> 1.4', '>= 1.4.3'
   gem.add_dependency 'fixwhich', '~> 1.0', '>= 1.0.2'
   gem.add_dependency 'bindeps', '~> 1.0', '>= 1.0.1'
-  gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.0'
+  gem.add_dependency 'transrate', '~> 1.0', '>= 1.0.1'
   gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
   gem.add_development_dependency 'turn', '~> 0.9', '>= 0.9.7'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: transfuse
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.4.2
 platform: ruby
 authors:
 - Richard Smith-Unna
@@ -94,7 +94,7 @@ dependencies:
         version: '1.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: 1.0.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -104,7 +104,7 @@ dependencies:
         version: '1.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: 1.0.1
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -210,7 +210,6 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - Gemfile
-- Gemfile.lock
 - README.md
 - Rakefile
 - bin/transfuse
@@ -218,6 +217,7 @@ files:
 - lib/transfuse.rb
 - lib/transfuse/cluster.rb
 - lib/transfuse/cmd.rb
+- lib/transfuse/consensus.rb
 - lib/transfuse/transfuse.rb
 - lib/transfuse/version.rb
 - notes.md
@@ -248,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.6
+rubygems_version: 2.2.2
 signing_key:
 specification_version: 4
 summary: Merge assemblies

data/Gemfile.lock DELETED Viewed

@@ -1,87 +0,0 @@
-PATH
-  remote: .
-  specs:
-    transfuse (0.1.1)
-      bindeps (~> 1.0, >= 1.0.1)
-      bio (~> 1.4, >= 1.4.3)
-      fixwhich (~> 1.0, >= 1.0.2)
-      transrate (= 1.0.0.beta3)
-      trollop (~> 2.0)
-GEM
-  remote: https://rubygems.org/
-  specs:
-    ansi (1.5.0)
-    bindeps (1.1.2)
-      fixwhich (~> 1.0, >= 1.0.2)
-    bio (1.4.3.0001)
-    coveralls (0.8.1)
-      json (~> 1.8)
-      rest-client (>= 1.6.8, < 2)
-      simplecov (~> 0.10.0)
-      term-ansicolor (~> 1.3)
-      thor (~> 0.19.1)
-    crb-blast (0.6.4)
-      bindeps (~> 1.0, >= 1.0.3)
-      bio (~> 1.4, >= 1.4.3)
-      fixwhich (~> 1.0, >= 1.0.2)
-      threach (~> 0.2, >= 0.2.0)
-      trollop (~> 2.0)
-    docile (1.1.5)
-    domain_name (0.5.24)
-      unf (>= 0.0.5, < 1.0.0)
-    facade (1.0.6)
-    fix-trinity-output (1.0.0)
-      trollop (~> 2.0)
-    fixwhich (1.0.2)
-      pathname2 (~> 1.4, >= 1.4.4)
-    http-cookie (1.0.2)
-      domain_name (~> 0.5)
-    json (1.8.3)
-    mime-types (2.6.1)
-    minitest (4.7.5)
-    netrc (0.10.3)
-    pathname2 (1.7.3)
-      facade
-    rake (10.4.2)
-    rest-client (1.8.0)
-      http-cookie (>= 1.0.2, < 2.0)
-      mime-types (>= 1.16, < 3.0)
-      netrc (~> 0.7)
-    shoulda-context (1.2.1)
-    simplecov (0.10.0)
-      docile (~> 1.1.0)
-      json (~> 1.8)
-      simplecov-html (~> 0.10.0)
-    simplecov-html (0.10.0)
-    term-ansicolor (1.3.0)
-      tins (~> 1.0)
-    thor (0.19.1)
-    threach (0.2.0)
-    tins (1.5.2)
-    transrate (1.0.0.beta3)
-      bindeps (~> 1.1, >= 1.1.2)
-      bio (~> 1.4, >= 1.4.3)
-      crb-blast (~> 0.5, >= 0.5.0)
-      fix-trinity-output (~> 1.0, >= 1.0)
-      trollop (~> 2.0, >= 2.0.0)
-      yell (~> 2.0, >= 2.0.4)
-    trollop (2.1.1)
-    turn (0.9.7)
-      ansi
-      minitest (~> 4)
-    unf (0.1.4)
-      unf_ext
-    unf_ext (0.0.7.1)
-    yell (2.0.5)
-PLATFORMS
-  ruby
-DEPENDENCIES
-  coveralls (~> 0.7)
-  rake (~> 10.3, >= 10.3.2)
-  shoulda-context (~> 1.2, >= 1.2.1)
-  simplecov (~> 0.8, >= 0.8.2)
-  transfuse!
-  turn (~> 0.9, >= 0.9.7)