RubyGems - transrate - Versions diffs - 0.1.0 → 0.2.0 - Mend

transrate 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

checksums.yaml +4 -4
data/.gitignore +16 -1
data/.travis.yml +8 -0
data/README.md +45 -43
data/Rakefile +36 -0
data/bin/transrate +98 -50
data/deps/deps.yaml +55 -0
data/lib/transrate.rb +19 -4
data/lib/transrate/assembly.rb +93 -182
data/lib/transrate/bowtie2.rb +37 -13
data/lib/transrate/cmd.rb +19 -0
data/lib/transrate/comparative_metrics.rb +239 -19
data/lib/transrate/contig.rb +212 -0
data/lib/transrate/contig_metrics.rb +76 -0
data/lib/transrate/read_metrics.rb +83 -41
data/lib/transrate/samtools.rb +73 -0
data/lib/transrate/transrater.rb +31 -11
data/lib/transrate/version.rb +1 -1
data/test/data/150uncovered.l.fq +892 -0
data/test/data/150uncovered.r.fq +892 -0
data/test/data/Os.protein.2.fa +95 -0
data/test/data/Os.protein.fa +199 -0
data/test/data/assembly.2.fa +26 -0
data/test/{assembly.fasta → data/assembly.fasta} +0 -0
data/test/data/bridging_reads.l.fastq +20 -0
data/test/data/bridging_reads.r.fastq +20 -0
data/test/data/sorghum_transcript.fa +4 -0
data/test/data/tiny.sam +4 -0
data/test/helper.rb +33 -2
data/test/test_bowtie.rb +54 -0
data/test/test_cmd.rb +15 -0
data/test/test_comp_metrics.rb +177 -0
data/test/test_contig.rb +61 -0
data/test/test_contig_metrics.rb +50 -0
data/test/test_inline.rb +10 -9
data/test/test_read_metrics.rb +68 -0
data/test/test_samtools.rb +22 -0
data/test/test_transrate.rb +40 -0
data/test/test_transrater.rb +68 -0
data/transrate.gemspec +16 -10
metadata +232 -57
data/lib/transrate/express.rb +0 -37
data/lib/transrate/log.rb +0 -16
data/lib/transrate/rb_hit.rb +0 -33
data/lib/transrate/reciprocal_annotation.rb +0 -105
data/lib/transrate/usearch.rb +0 -66
data/test/test_test.rb +0 -41

data/test/test_cmd.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'helper'
+class TestCmd < Test::Unit::TestCase
+  context "Cmd" do
+    should "run commands" do
+      cmd = Transrate::Cmd.new 'echo "success"'
+      cmd.run
+      assert_equal "success", cmd.stdout.chomp, 'run echo command'
+    end
+  end
+end

data/test/test_comp_metrics.rb ADDED Viewed

@@ -0,0 +1,177 @@
+require 'helper'
+class TestCompMetrics < Test::Unit::TestCase
+  context "ComparativeMetrics" do
+    setup do
+      querypath = File.join(File.dirname(__FILE__),
+                            'data',
+                            'assembly.2.fa')
+      targetpath = File.join(File.dirname(__FILE__),
+                            'data',
+                            'Os.protein.2.fa')
+      assembly = Transrate::Assembly.new(querypath)
+      reference = Transrate::Assembly.new(targetpath)
+      threads = 8
+      @comp = Transrate::ComparativeMetrics.new(assembly, reference, threads)
+    end
+    should "run metrics on assembly" do
+      Dir.mktmpdir do |tmpdir|
+        Dir.chdir tmpdir do
+          @comp.run
+          assert @comp.has_run
+        end
+      end
+    end
+    should "calculate ortholog hit ratio" do
+      crb = CRBHelper.new(false)
+      hash = Hash.new
+      (1..11).each do |i|
+        hash["q#{i}"] = []
+      end
+      # Q1   |------------|
+      # Q2                    |--------------|
+      # T1 |------------------------------------|  # coverage = 200/500
+      hash["q1"] << HitHelper.new("q1", "t1", 1, 100, 101, 200, 100, 500)
+      hash["q2"] << HitHelper.new("q2", "t1", 1, 100, 301, 400, 100, 500)
+      # Q3                    |------------|
+      # Q4   |--------------|
+      # T2 |------------------------------------| # coverage = 200/500
+      hash["q3"] << HitHelper.new("q3", "t2", 1, 100, 301, 400, 100, 500)
+      hash["q4"] << HitHelper.new("q4", "t2", 1, 100, 101, 200, 100, 500)
+      # Q5                |------------|
+      # Q6   |-------------------|
+      # T3 |------------------------------------| # coverage = 300/500
+      hash["q5"] << HitHelper.new("q5", "t3", 1, 200, 201, 400, 200, 500)
+      hash["q6"] << HitHelper.new("q6", "t3", 1, 200, 101, 300, 200, 500)
+      # Q7             |------------|
+      # Q8      |------------------------|
+      # T4 |------------------------------------| # coverage = 300/500
+      hash["q7"] << HitHelper.new("q7", "t4", 1, 100, 201, 300, 100, 500)
+      hash["q8"] << HitHelper.new("q8", "t4", 1, 300, 101, 400, 300, 500)
+      # Q9     |--------|
+      # Q10                        |--------|
+      # Q11        |--------------------|
+      # T5 |------------------------------------| # coverage = 600/1000
+      hash["q9"] << HitHelper.new("q9",   "t5", 1, 200, 201, 400, 200, 1000)
+      hash["q10"] << HitHelper.new("q10", "t5", 1, 200, 601, 800, 200, 1000)
+      hash["q11"] << HitHelper.new("q11", "t5", 1, 400, 301, 700, 400, 1000)
+      crb.hash = hash
+      ohr = @comp.ortholog_hit_ratio crb
+      assert_equal 16.0/30.0, ohr
+    end
+    should "calculate potential chimera count" do
+      crb = CRBHelper.new(false)
+      hash = Hash.new
+      (1..3).each do |i|
+        hash["q#{i}"] = []
+      end
+      # T1   |---------|
+      # T2                 |---------|
+      # Q1 |----------------------------| # chimera = true
+      hash["q1"] << HitHelper.new("q1", "t1", 101, 200, 1, 100, 500, 100)
+      hash["q1"] << HitHelper.new("q1", "t2", 301, 400, 1, 100, 400, 100)
+      # T3   |---------|
+      # T3                 |---------|
+      # Q2 |----------------------------|
+      # chimera = true because the reference has the region 1-100 duplicated
+      hash["q2"] << HitHelper.new("q2", "t3", 101, 200, 1, 100, 500, 100)
+      hash["q2"] << HitHelper.new("q2", "t3", 301, 400, 1, 100, 400, 100)
+      # # T4   |---------|
+      # # T4                 |---------|
+      # # Q3 |----------------------------|
+      # # chimera = false because the two hits cover different, adjacent regions of the reference (1-100 and 101-200)
+      hash["q3"] << HitHelper.new("q3", "t4", 101, 200, 1, 100, 500, 200)
+      hash["q3"] << HitHelper.new("q3", "t4", 301, 400, 101, 200, 400, 200)
+      crb.hash = hash
+      @comp.chimeras crb
+      assert_equal 0.667, @comp.p_chimeras.round(3)
+    end
+    should "calculate overlap amount" do
+      assert_equal 0.5, @comp.overlap_amount(201,500,101,400), "1"
+      assert_equal 0.5, @comp.overlap_amount(101,400,201,500), "2"
+      assert_equal 0.5, @comp.overlap_amount(201,400,101,500), "3"
+      assert_equal 0.5, @comp.overlap_amount(101,500,201,400), "4"
+    end
+    should "calculate number of contigs with crbblast hit" do
+      Dir.mktmpdir do |tmpdir|
+        Dir.chdir tmpdir do
+          @comp.run
+          assert_equal 11, @comp.comp_stats[:n_contigs_with_recip]
+          assert_equal 11/13.0, @comp.comp_stats[:p_contigs_with_recip]
+        end
+      end
+    end
+    should "calculate number of reference sequences with crbblast hit" do
+      Dir.mktmpdir do |tmpdir|
+        Dir.chdir tmpdir do
+          @comp.run
+          assert_equal 10, @comp.comp_stats[:n_refs_with_recip]
+          assert_equal 0.5, @comp.comp_stats[:p_refs_with_recip]
+        end
+      end
+    end
+    should "calculate reference sequence coverage" do
+      # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
+      # of their length by CRB-BLAST hit
+      Dir.mktmpdir do |tmpdir|
+        Dir.chdir tmpdir do
+          @comp.run
+          stats = @comp.comp_stats
+          assert_equal 10, stats[:cov25]
+          assert_equal 10, stats[:cov50]
+          assert_equal 7, stats[:cov75]
+          assert_equal 6, stats[:cov85]
+          assert_equal 3, stats[:cov95]
+        end
+      end
+    end
+    should "number of reference sequences coverage" do
+      # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
+      # of their length by CRB-BLAST hit
+      crb = CRBHelper.new(false)
+      hash = Hash.new
+      (1..5).each do |i|
+        hash["q#{i}"] = []
+      end
+      hash["q1"] << HitHelper.new("q1", "t1", 1, 250, 101, 350, 250, 1000)
+      hash["q2"] << HitHelper.new("q2", "t2", 1, 500, 101, 600, 500, 1000)
+      hash["q3"] << HitHelper.new("q3", "t3", 1, 750, 101, 850, 750, 1000)
+      hash["q4"] << HitHelper.new("q4", "t4", 1, 850, 101, 950, 850, 1000)
+      hash["q5"] << HitHelper.new("q5", "t5", 1, 950, 1, 950, 950, 1000)
+      crb.hash = hash
+      ohr = @comp.ortholog_hit_ratio crb
+      stats = @comp.comp_stats
+      assert_equal 5, stats[:cov25]
+      assert_equal 4, stats[:cov50]
+      assert_equal 3, stats[:cov75]
+      assert_equal 2, stats[:cov85]
+      assert_equal 1, stats[:cov95]
+    end
+  end
+end

data/test/test_contig.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require 'helper'
+require 'bio'
+class TestContig < Test::Unit::TestCase
+  context "Contig" do
+    setup do
+      seq = Bio::Sequence.new 'ATGCGTGTATATACGCGTAG'
+      @contig = Transrate::Contig.new seq
+    end
+    should "know the number and proportion of each base it contains" do
+      assert_equal 5, @contig.bases_a, "count of base a"
+      assert_equal 0.25, @contig.prop_a, "proportion of base a"
+      assert_equal 3, @contig.bases_c, "count of base c"
+      assert_equal 0.15, @contig.prop_c, "proportion of base c"
+      assert_equal 6, @contig.bases_g, "count of base g"
+      assert_equal 0.3, @contig.prop_g, "proportion of base g"
+      assert_equal 6, @contig.bases_t, "count of base t"
+      assert_equal 0.3, @contig.prop_t, "proportion of base t"
+      assert_equal 0, @contig.bases_n, "count of base n"
+      assert_equal 0.0, @contig.prop_n, "proportion of base n"
+    end
+    should "know how many of each two-base pair it contains" do
+      assert_equal 3, @contig.dibase_composition[:cg], "cg count"
+      assert_equal 3, @contig.dibase_composition[:at], "at count"
+      assert_equal 2, @contig.dibase_composition[:tg], "tg count"
+    end
+    should "know its own gc content" do
+      assert_equal 9, @contig.bases_gc, "count of bases that are c or g"
+      assert_equal 0.45, @contig.prop_gc.round(2),
+                   "proportion of bases that are c or g"
+    end
+    should "know its own base-pair skew" do
+      assert_equal 0.45, @contig.gc_skew.round(2), "gc skew"
+      assert_equal 0.55, @contig.at_skew.round(2), "at skew"
+    end
+    should "know its own CpG count and density" do
+      assert_equal 3, @contig.cpg_count, "cpg count"
+      assert_equal 66.67, @contig.cpg_ratio.round(2), "cpg ratio"
+    end
+    should "know the length of its own longest orf" do
+      assert_equal 6, @contig.orf_length, "orf length"
+    end
+    should "know its own linguistic complexity" do
+      assert_equal 0.0586, @contig.linguistic_complexity(4).round(4),
+                   "linguistic complexity k=4"
+      assert_equal 0.0037, @contig.linguistic_complexity(6).round(4),
+                   "linguistic complexity k=6"
+    end
+  end
+end

data/test/test_contig_metrics.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require 'helper'
+class TestContigMetrics < Test::Unit::TestCase
+  context "transrate" do
+    setup do
+      querypath = File.join(File.dirname(__FILE__), 'data',
+                            'assembly.fasta')
+      assembly = Transrate::Assembly.new(querypath)
+      @contig_metrics = Transrate::ContigMetrics.new(assembly)
+    end
+    should "run metrics on assembly" do
+      @contig_metrics.run
+      assert @contig_metrics.has_run
+    end
+    should "get gc content" do
+      @contig_metrics.run
+      assert_equal 0.37672, @contig_metrics.gc_prop.round(5)
+    end
+    should "get gc skew" do
+      @contig_metrics.run
+      assert_equal 0.00440, @contig_metrics.gc_skew.round(5)
+    end
+    should "get at skew" do
+      @contig_metrics.run
+      assert_equal -0.00718, @contig_metrics.at_skew.round(5)
+    end
+    should "get CpG density" do
+      @contig_metrics.run
+      assert_equal 0.52828, @contig_metrics.cpg_ratio.round(5)
+    end
+    should "get linguistic complexity" do
+      @contig_metrics.run
+      assert_equal 0.26599, @contig_metrics.linguistic_complexity.round(5)
+    end
+    should "get the number and proportion of Ns" do
+      @contig_metrics.run
+      assert_equal 2, @contig_metrics.bases_n
+      assert_equal 0.00033, @contig_metrics.proportion_n.round(5)
+    end
+  end
+end

data/test/test_inline.rb CHANGED Viewed

@@ -1,29 +1,30 @@
 #!/usr/bin/env  ruby
 require 'helper'
+require 'bio'
 class TestInline < Test::Unit::TestCase
   context 'transrate' do
     setup do
-      @a = Transrate::Assembly.new('test/assembly.fasta')
-      @seq1 = 'ATGCCCCTAGGGTAG'
+      filepath = File.join(File.dirname(__FILE__), 'data', 'assembly.fasta')
+      @a = Transrate::Assembly.new(filepath)
     end
     should 'find longest orf in file' do
       orfs = []
-      @a.assembly.each do |entry|
-        l = @a.orf_length entry.seq
-        orfs << l
+      @a.assembly.each do |contig|
+        orfs << contig.orf_length
       end
-      assert_equal orfs.length, 4
-      assert_equal orfs, [333, 370, 131, 84]
+      assert_equal 4, orfs.length
+      assert_equal [333, 370, 131, 84], orfs
     end
     should 'find longest orf in sequence' do
-      l = @a.orf_length(@seq1)
-      assert_equal l, 4
+      seq = Bio::Sequence.new 'ATGCCCCTAGGGTAG'
+      contig = Transrate::Contig.new seq
+      assert_equal 4, contig.orf_length
     end
   end

data/test/test_read_metrics.rb ADDED Viewed

@@ -0,0 +1,68 @@
+require 'helper'
+require 'tmpdir'
+class TestReadMetrics < Test::Unit::TestCase
+  context "ReadMetrics" do
+    setup do
+      query = File.join(File.dirname(__FILE__), 'data',
+                        'sorghum_transcript.fa')
+      assembly = Transrate::Assembly.new(query)
+      @read_metrics = Transrate::ReadMetrics.new(assembly)
+    end
+    teardown do
+      if File.exist?("test/data/sorghum_transcript.fa.fai")
+        rm = "rm test/data/sorghum_transcript.fa.fai"
+        `#{rm}`
+      end
+    end
+    should "setup correctly" do
+      assert @read_metrics
+    end
+    should "calculate read mapping statistics" do
+      left = File.join(File.dirname(__FILE__), 'data', '150uncovered.l.fq')
+      right = File.join(File.dirname(__FILE__), 'data', '150uncovered.r.fq')
+      Dir.mktmpdir do |tmpdir|
+        Dir.chdir tmpdir do
+          @read_metrics.run(left, right)
+          stats = @read_metrics.read_stats
+          assert @read_metrics.has_run
+          assert_equal 223, stats[:num_pairs], 'number of read pairs'
+          assert_equal 202, stats[:total_mappings], 'number mapping'
+          assert_equal 90.58, stats[:percent_mapping].round(2),
+                       'percent mapping'
+          assert_equal 202, stats[:good_mappings], 'good mapping'
+          assert_equal 90.58,
+                       stats[:pc_good_mapping].round(2),
+                       'percent good mapping'
+          assert_equal 0, stats[:bad_mappings], 'bad mapping'
+          assert_equal 22.91, stats[:mean_coverage].round(2), 'mean coverage'
+          assert_equal 11, stats[:n_uncovered_bases], 'n uncovered bases'
+          assert_equal 0.007,
+                       stats[:p_uncovered_bases].round(3),
+                       'p uncovered bases'
+        end
+      end
+    end
+    should "find read pairs that support scaffolding" do
+      left = File.join(File.dirname(__FILE__), 'data', 'bridging_reads.l.fastq')
+      right = File.join(File.dirname(__FILE__),
+                        'data',
+                        'bridging_reads.r.fastq')
+      Dir.mktmpdir do |tmpdir|
+        Dir.chdir tmpdir do
+          @read_metrics.run(left, right)
+          stats = @read_metrics.read_stats
+          assert_equal 1, stats[:potential_bridges], 'potential bridges'
+        end
+      end
+    end
+  end
+end

data/test/test_samtools.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'helper'
+class TestSamtools < Test::Unit::TestCase
+  context "samtools" do
+    should "know the path to samtools binary" do
+      msg = /Program: samtools/
+      path = Transrate::Samtools.path
+      res = `#{path} 2>&1`.split("\n").join
+      assert msg =~ res
+    end
+    should "run commands" do
+      sam = File.join(File.dirname(__FILE__), 'data', 'tiny.sam')
+      Transrate::Samtools.run "view -bS #{sam} > tiny.bam"
+      assert_equal 460, File.size('tiny.bam'), 'bam file should be created'
+      File.delete 'tiny.bam'
+    end
+  end
+end

data/test/test_transrate.rb ADDED Viewed

@@ -0,0 +1,40 @@
+#!/usr/bin/env	ruby
+require 'helper'
+class TestTransrate < Test::Unit::TestCase
+  context "transrate" do
+    setup do
+      filepath = File.join(File.dirname(__FILE__), 'data', 'assembly.fasta')
+      @a = Transrate::Assembly.new(filepath)
+    end
+    should "create assembly object" do
+      assert @a
+      assert_equal @a.assembly.size, 4
+    end
+    should "run basic stats" do
+      stats = @a.basic_stats
+      assert_equal stats["n_seqs"], 4
+      assert_equal stats["smallest"], 1409
+      assert_equal stats["largest"], 1630
+      assert_equal stats["mean_len"], 1508.25
+    end
+    should "run metrics on assembly" do
+      ans = @a.run(2) # using 2 threads
+      assert_equal ans, true, "should run but returned #{ans}"
+    end
+    should "find the mean length" do
+      ans = @a.run(2)
+      mean = @a.mean_len
+      n_bases = @a.n_bases
+      assert_equal mean, 1508.25
+      assert_equal n_bases, 6033
+    end
+  end
+end