transrate 1.0.0.beta3 → 1.0.0.beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
1
1
  module Transrate
2
2
 
3
+ class ReadMetricsError < TransrateError; end
4
+
3
5
  class ReadMetrics
4
6
 
5
7
  attr_reader :fragments, :fragments_mapping, :p_good_mapping
@@ -9,7 +11,7 @@ module Transrate
9
11
  attr_reader :read_length
10
12
 
11
13
  def initialize assembly
12
- @assembly = assembly
14
+ @assembly = assembly # Transrate::Assembly
13
15
  @mapper = Snap.new
14
16
  @salmon = Salmon.new
15
17
  self.initial_values
@@ -60,7 +62,12 @@ module Transrate
60
62
  if !File.exist?(assigned_bam)
61
63
  assigned_bam = assign_and_quantify(bamfile, threads)
62
64
  end
63
- File.rename(assigned_bam, final_bam)
65
+ if File.exist?(assigned_bam)
66
+ File.rename(assigned_bam, final_bam)
67
+ else
68
+ logger.error "Couldn't find #{assigned_bam} to rename"
69
+ raise ReadMetricsError
70
+ end
64
71
  end
65
72
  # analyse the final mappings
66
73
  analyse_read_mappings final_bam
@@ -116,6 +123,7 @@ module Transrate
116
123
  def analyse_expression salmon_output
117
124
  salmon_output.each_pair do |name, expr|
118
125
  contig_name = Bio::FastaDefline.new(name.to_s).entry_id
126
+ contig_name.gsub!(/;$/, '') # trim trailing semicolon
119
127
  contig = @assembly[contig_name]
120
128
  if expr[:eff_len]==0
121
129
  coverage = 0
@@ -188,6 +196,7 @@ module Transrate
188
196
 
189
197
  def populate_contig_data row
190
198
  name = Bio::FastaDefline.new(row[:name].to_s).entry_id
199
+ name.gsub!(/;$/, '') # trim trailing semicolon
191
200
  contig = @assembly[name]
192
201
  contig.p_seq_true = row[:p_seq_true]
193
202
  contig.uncovered_bases = row[:bases_uncovered]
@@ -17,6 +17,7 @@ module Transrate
17
17
  def run assembly, bamfile, threads=8
18
18
  assembly = assembly.file if assembly.is_a? Assembly
19
19
  output = "quant.sf"
20
+ sampled_bam = "postSample.bam"
20
21
  @fin_output = "#{File.basename assembly}_#{output}"
21
22
  unless File.exist? @fin_output
22
23
  salmon = Cmd.new build_command(assembly, bamfile, threads)
@@ -25,9 +26,13 @@ module Transrate
25
26
  logger.error salmon.stderr
26
27
  raise SalmonError.new("Salmon failed")
27
28
  end
29
+ unless File.exist?(sampled_bam)
30
+ logger.error salmon.stderr
31
+ raise SalmonError.new("#{sampled_bam} not created")
32
+ end
28
33
  File.rename(output, @fin_output)
29
34
  end
30
- return 'postSample.bam'
35
+ return sampled_bam
31
36
  end
32
37
 
33
38
  def build_command assembly, bamfile, threads=4
@@ -39,7 +44,7 @@ module Transrate
39
44
  cmd << " --sampleOut"
40
45
  cmd << " --sampleUnaligned" # thanks Rob!
41
46
  cmd << " --output ."
42
- cmd << " --useReadCompat"
47
+ cmd << " --useVBOpt"
43
48
  cmd << " --useErrorModel"
44
49
  cmd
45
50
  end
@@ -49,9 +54,14 @@ module Transrate
49
54
  File.open(file).each do |line|
50
55
  if line !~ /^#/
51
56
  line = line.chomp.split("\t")
57
+ unless line.length == 4
58
+ raise SalmonError.new("Salmon output file should have 4 columns " +
59
+ "but it had #{line.length}\n" +
60
+ "Please check you are using the correct version of Salmon")
61
+ end
52
62
  target = line[0]
53
63
  effective_length = line[1]
54
- effective_count = line[4]
64
+ effective_count = line[3]
55
65
  tpm = line[2]
56
66
  expression[target] = {
57
67
  :eff_len => effective_length.to_i,
@@ -6,6 +6,8 @@ module Transrate
6
6
  # while also minimising the number of low scoring contigs.
7
7
  class ScoreOptimiser
8
8
 
9
+ require 'csv'
10
+
9
11
  def initialize assembly, read_metrics
10
12
  @assembly = assembly
11
13
  @fragments = read_metrics.fragments
@@ -20,7 +22,7 @@ module Transrate
20
22
  @contig_score * (@good / @total.to_f)
21
23
  end
22
24
 
23
- def optimal_score
25
+ def optimal_score(prefix='assembly')
24
26
  return [@optimal, @cutoff] unless @optimal.nil?
25
27
  product = 0
26
28
  good = 0
@@ -44,7 +46,10 @@ module Transrate
44
46
  end
45
47
  @optimal = 0
46
48
  @cutoff = 0
49
+ out = CSV.open("#{prefix}_score_optimisation.csv", 'w')
50
+ out << %w[cutoff assembly_score]
47
51
  cutoffscores.each do |c, score|
52
+ out << [c, score]
48
53
  if score > @optimal
49
54
  @optimal = score
50
55
  @cutoff = c
@@ -41,8 +41,7 @@ module Transrate
41
41
  cmd
42
42
  end
43
43
 
44
- def map_reads(file, left, right, insertsize: 200,
45
- insertsd: 50, outputname: nil, threads: 8)
44
+ def map_reads(file, left, right, outputname: nil, threads: 8)
46
45
  raise SnapError.new("Index not built") if !@index_built
47
46
 
48
47
  lbase = File.basename(left.split(",").first)
@@ -69,11 +69,11 @@ module Transrate
69
69
  return @score_optimiser.raw_score
70
70
  end
71
71
 
72
- def assembly_optimal_score
72
+ def assembly_optimal_score prefix
73
73
  if !@score_optimiser
74
74
  @score_optimiser = ScoreOptimiser.new(@assembly, @read_metrics)
75
75
  end
76
- return @score_optimiser.optimal_score
76
+ return @score_optimiser.optimal_score prefix
77
77
  end
78
78
 
79
79
  def assembly_metrics
@@ -11,7 +11,7 @@ module Transrate
11
11
  MAJOR = 1
12
12
  MINOR = 0
13
13
  PATCH = 0
14
- BUILD = 'beta3'
14
+ BUILD = 'beta4'
15
15
 
16
16
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
17
17
  end
@@ -1,6 +1,6 @@
1
1
  >Sb09g017110.1
2
2
  AGCGTGGGAAGGGAGATCACTGGAAGGGAGAAAGAAATGGAGCATGAAGAAGAGCTGCCTTCATCTTCTTCCTCCTTGGGTTACCTGATGCAGTGTAGGATCTGCCACGAGGAAGAGAACGAAGGGCGCGCGATCATGGAGTCTCCTTGTGGATGCTCCGGCTCTCTCAAGTATGCTCACAGGGGATGTGTGCAGAGATGGTGTGATGAGAAGGGGAGCACCCTCTGTGAGATTTGCCTTCAGAATTTCGAGCCAGGCTACACAATGCCTCCAAAGAAAACTCCGGCGATTGAAACTGCGGTCACTATCAGTGAACATGAGGACATGCAACCTTTGGAATCTCCGGAGGGCTCCATTGACGGTGCAGATTACACCAGGTGCTCCTACGCCGCAGATCAATGCGCCACATGGTGCCGGTCGCTGGCGATCACGTTCACCATTATGCTGTTGGCATGGCATCTGGTTGCAGTAGTGACGGTTGAAGCAGCAGATCACTGCGCGTTCAGTCTCCTGACAATGTACTTACTTCGTGCTGCTGGTATCCTGCTGCCGCTCTATGTTGTCATGCGGCTGATTCGCATCGTCCAGAATGGGCAGAGGCAGTATCGGTTGCAGCTGCTGGAGGACCAAAGAAGAAATGCATCAACTATATGAATTATATGCGTGACCACGAGAAGGACCAGCTAGTTATTAATATTCATTAAGCCAATGAATTTAAATGAATCTTATGGAAGGTTGTACAGATGTACATTATGTATTACTGGGGAATTTTTCAAACAGAACATCATTTGTAAACTTTCAGTATACATACCGTTGGTGCAGTGAAAAACAGCAGC
3
- >Sb02g028080.1
3
+ >Sb02g028080.1;
4
4
  ATGGGGACCCCTCTCCTCTTCCCCCTTCTCGTCACCCTCCAGCTGTTCACCGCCGCCTCCCCCGCGGTCGCGTCGTCGCACATCTCCGTCGTCATCTCGCAGTCGGGCCTCGACTTCGCCAAGGACCTGCTCGTGTCCCGTGCCGTCGCGACCCTCACGCCCCTGAACGTGCCGGACATCCAGAAGACCATGAGCACCGTCGTGGGCACCGTCCGCGTGGCCGCATCCGGGATTGTGCTCAACGGCCTCGCCGTCACCAACTCCACCGTCGCTATCGGGGACACGGGTGTTGTCGTGGCCGCCTCGTTGGCCAGAGCGAACCTCACCATGGAGTGGAACTACTCGTACAGCGCCTGGATTGTGACCATATCCGACAGCGGGAATGCTTCGATCCAGGTTGAAGGAATGGAGGTTGGTGTTTCCATGGTCATGAAGAATCAGAATGGATCTATCAAGCTGTCTGTTACAGAATGTAGCTGTAATATGGAGGACTTAGACATAACACTAAGTGGAGGAGCATCTTGGTTCTATCAAGTGTTTATAGATAGTTTCAGTAATCATATCAGATCATCAGTGGAAAATGCAATTGAGAACAAAGTAATGGAAGGTGCACTGAAGCTTGACTCTTTCCTGGGAAACCTTCCAAAGAAAATTGATCTTGATAGCGTTGCTGCAATGAATGTGACTTTTGTTAATGATCCACTATTCAAGAGCTCCTCTGTTGAGTTTGATATAGATGGCTTATTTATTCCATCGGATGAAACTGCTCCCGGAGACATGCTTCTTGGAAATACCCAATTTGCATTACCTCTTGGGAGCTCCTCGAGAATGCTTTGGATTTCATTGGATGAAGATGTTTTCAACTCCGTTTCAGCTCTCTACTTCAAGGCTGGTTTGCTGCAACGGATGGTGGACGAGGTTCCTGAACAGTTTCTTTTGAACACTGCTAGCTGGAGATTTTTGGTTCCTCGATTGTATCGAGAATATCCTGACGATGATATGCTACTGAATATCTCTGCAGTTTCGCCTCCCTCTGTGAGGATTAATGTGGGTAGAATTGATGCCACAGTTGACTTAGATGTCACAGTTAATGTCTTGGATTTTGGTGAGATAGTTCCAGTTGCATGCATATCAGTGTCGGTTGCTGTTTCTGGAGCTGCAGCGGTATCAGAGAATAATCTTGTTGGGAGAGTGAAATTGGATTATTTCTCATTTACCTTGAAATGGAGCAAAGTTGGCAAACTCCACACCAGTCTAGTGCAGACCGTGCTGAGGATTTTGCTGAAAAGTTTGTTTGTACCTTATGTGAACTCATATCTCGAGCAAGGCTTCCAGCTGCCCATCATCAAGGGATTCTCCGTCATAGATGCATATGTCCTCACTTCTTACTCAAGAATGATTGTTAGCTGCAATGTTGCGTTCCCTGAGCCAGAGGTTCTGTCTCCTATCCAAGAATCCAAGACCAACGAAGATTTGTCACATGAAGTTGGTTTGCTGATTGGATCTGCCAAAACTTGGCAGCCACCGATAACTAGTGTAAAATTCCTGTAAACAGTAACGCTATTCGTCGCAGTGTTTTGTTTTTAAAAATGTGTAAATAACTGTGCACTGTATATATGTATGTATGTATGTATGTATATATGCATATTAAATCAACAGATAGGAGCCATTCCGTAGCCTCTAAGTGGAATCGTTAGGTACTACAGTTTCCTCTATCCCAAATTATAAGCTGTTCTAGTTTTTT
5
5
  >Sb03g034100.1
6
6
  CTCACTCTCACACTACTCCTCCCCTCTCCGGCTCTCTGCCTCTGACGTCTGACCTCTCCTCCCCAACGGTGAGGCCGGCGCATTGCCGTTTCGAGCGCGGACACCGAGGGCTAGAACTAGAAGTGGCGGCGGTGCCAGGGCTCGGCGCTCGGTCGGCAATGGCGGGGCGGCTTATGCTGGCGGCGCTCCCTATTCTCCTCTTCTTATTGCTCGTCGGGCAATGCCACGGCGGCAAGATTGGCGTCTGCTACGGCCGCAACGCCGACGACCTGCCGGCGCCGGACAAGGTGGCGCAGCTAATCCAGCAGCAATCCATCAAGTACGTGCGCATCTACGACACCAACATCGACGTCATCAAGGCCTTCGCCAACACCGGCGTCGAGCTCATGGTCGGCGTCCCCAACTCCGACCTCCTCGCCTTCGCGCAGTACCAGTCCAACGTCGACACATGGCTCAAGAACAGCATTCTCCCCTACTACCCGGCCACCATGATCACCTACATCACCGTCGGCGCCGAGGTCACCGAGAGCCCCACCAACGTCTCCGCCCTCGTCGTGCCTGCCATGCGCAATGTGCACACCGCACTCAAGAAGGCCGGCCTGCACAAGAAGATCACCATCTCCAGCACCCACTCGCTCGGGATACTGTCACGGTCGTTCCCGCCGTCTGCTGGGGCGTTCAACAGCAGCTACGCCTACTTCTTGAAGCCTATGCTCGAGTTCCTTGTGGAGAATCAGGCGCCGTTCATGGTGGATTTATACCCCTACTATGCGTACCAGAACTCACCGAGCAATGTGTCCCTCAACTACGCCCTGTTCTCGCCACAGTCTCAGGATGTGATTGACCCAAACACTGGACTGGTTTACACTAACATGTTTGATGCCCAGGTTGATTCCATCTTCTTTGCGCTCATGGCTCTGAACTTCAAAACTCTGAAGATCATGATCACTGAGTCAGGGTGGCCAAACAAAGGGGCGGCCAAGGAGACTGGAGCCACTCCAGACAATGCTCAGACTTACAATACCAATTTGATACGCCATGTTGTTAATGACAGTGGCACGCCTGCGAAACCAGGGGAAGAAATTGATGTCTACATATTTTCATTGTTCAATGAGAACAGGAAACCTGGCATTGAGTCGGAGAGGAACTGGGGACTGTTTTTTCCTGATAAGAGCTCTATCTACAGCCTTGATTGGACGGGCCGAGGCAATGTGGATGTTATGACTGGAGCAAACATTACAAGTGCAAATGGTACCTGGTGTATTGCTTCAGCTAATGCATCAGAAACAGATCTGCAGAATGCCCTCAACTGGGCATGTGGTCCAGGCAACGTAGATTGCTCTGCCATTCAACCAAGCCAACCCTGCTACCAGCCGGACACTTTAGCTTCCCATGCTTCATATGCATTCAATAGCTACTACCAGCAAAATGGAGCCAACGTTGTGGCCTGTGACTTCAGTGGTGCGGGAATACGAACGACGAAAGATCCAAGTTACGACACTTGTGTCTATTTGGCTGCAGGCAATAAGATGAGCACAATGAATTCGACATCTCTTCCAGCTCAGAGCAACTCTGGTCCAGTTCCATGCGCCAAATACTTCACCACTTTCCTCCCCATGCTGGCCCCCGTGATGGCTGCAGTTATGCTGTGATCTATGGAAATGCTCCAGCTAGCCTCTGCAGATGTGGAGATGAAAGGTGAATTGCGTAATGCTGGTAACCAGCCGATGTTCTGTTTTGCTATGAGCAGTAGACTAGTAGTAGTCTAGTAGAGAGGCATATTATGCTGCTGTAGGAATTCTCTGGTCAGTTGAGATGTACATCGTCGCGCAGACAATATATATCAGCTGGCCTTAAGAACTCGATAACCTTTTCTGCTGTCTTTCG
data/test/data/test.sf CHANGED
@@ -9,22 +9,22 @@
9
9
  # [ sampleOut ] => { }
10
10
  # [ useFragLenDist ] => { }
11
11
  # [ sampleUnaligned ] => { }
12
- # Name Length TPM FPKM NumReads
13
- scaffold1 1016 549.279 527.364 20690
14
- scaffold2 1439 598.782 574.892 31945
15
- scaffold3 783 408.072 391.791 11846
16
- scaffold4 893 441.382 423.772 14613
17
- scaffold5 622 494.487 474.758 11403
18
- scaffold6 2073 4.77214 4.58174 366.764
19
- scaffold7 1291 4.288 4.11692 205.236
20
- scaffold8 1355 17.9155 17.2007 900
21
- scaffold9 258 15.891 15.257 152
22
- scaffold10 1934 104.823 100.641 7516
23
- scaffold11 1922 23.9916 23.0344 1709.57
24
- scaffold12 1651 136.498 131.052 8355
25
- scaffold13 1834 360.757 346.363 24529.4
26
- scaffold14 580 6.13864 5.89373 132
27
- scaffold15 1539 4.8197 4.62741 275
28
- scaffold16 2302 26.1878 25.1429 2235
29
- scaffold17 543 1.98695 1.90767 40
30
- scaffold18 4121 25.4151 24.4011 3883
12
+ # Name Length TPM NumReads
13
+ scaffold1 1016 549.279 20690
14
+ scaffold2 1439 598.782 31945
15
+ scaffold3 783 408.072 11846
16
+ scaffold4 893 441.382 14613
17
+ scaffold5 622 494.487 11403
18
+ scaffold6 2073 4.77214 366.764
19
+ scaffold7 1291 4.288 205.236
20
+ scaffold8 1355 17.9155 900
21
+ scaffold9 258 15.891 152
22
+ scaffold10 1934 104.823 7516
23
+ scaffold11 1922 23.9916 1709.57
24
+ scaffold12 1651 136.498 8355
25
+ scaffold13 1834 360.757 24529.4
26
+ scaffold14 580 6.13864 132
27
+ scaffold15 1539 4.8197 275
28
+ scaffold16 2302 26.1878 2235
29
+ scaffold17 543 1.98695 40
30
+ scaffold18 4121 25.4151 3883
@@ -28,9 +28,9 @@ class TestAssembly < Test::Unit::TestCase
28
28
  assert File.exist?("good.sorghum_100.fa"), "good output exists"
29
29
  assert File.exist?("bad.sorghum_100.fa"), "bad output"
30
30
  file_size = File.stat("good.sorghum_100.fa").size
31
- assert_in_delta 81_000, file_size, 5000, "good file size"
31
+ assert_in_delta 86_119, file_size, 5000, "good file size"
32
32
  file_size = File.stat("bad.sorghum_100.fa").size
33
- assert_in_delta 58_000, file_size, 5000, "bad file size"
33
+ assert_in_delta 53_000, file_size, 5000, "bad file size"
34
34
  end
35
35
  end
36
36
  end
data/test/test_bin.rb CHANGED
@@ -116,6 +116,7 @@ class TestTransrateBin < Test::Unit::TestCase
116
116
  end
117
117
  assert_in_delta 137748, hash[:n_bases], 1000, "number of bases"
118
118
  assert_equal 1692, hash[:n50], "n50"
119
+ assert_equal 25006 + 223, hash[:fragments], "number of reads"
119
120
  end
120
121
 
121
122
  should "fail when one of multiple assemblies is missing" do
@@ -1,210 +1,381 @@
1
1
  require 'helper'
2
+ require 'crb-blast'
3
+
4
+ module Transrate
5
+ class ComparativeMetrics
6
+ attr_reader :assembly
7
+ attr_reader :reference
8
+ attr_reader :crbblast
9
+ end
10
+ end
2
11
 
3
12
  module CRB_Blast
4
13
  class CRB_Blast
5
- def change_hit(query_name, target_name, qstart, qend, tstart, tend, qlen, tlen)
6
- hits = @reciprocals[query_name]
7
- hits.each do |hit|
8
- if hit.target == target_name
9
- hit.qstart = qstart
10
- hit.qend = qend
11
- hit.tstart = tstart
12
- hit.tend = tend
13
- hit.qlen = qlen
14
- hit.tlen = tlen
14
+ def add_missing
15
+ @missed.each do |query_id, missed|
16
+ missed.each do |hit|
17
+ @reciprocals[hit.query] ||= []
18
+ @reciprocals[hit.query] << hit
15
19
  end
16
20
  end
17
21
  end
22
+ end
23
+ end
18
24
 
19
- def add_hit(query_name, target_name, qstart, qend, tstart, tend, qlen, tlen)
20
- @reciprocals[query_name] ||= []
21
- list = Array.new(14)
22
- list[0] = query_name
23
- list[1] = target_name
24
- list[6] = qstart
25
- list[7] = qend
26
- list[8] = tstart
27
- list[9] = tend
28
- list[12] = qlen
29
- list[13] = tlen
30
- @reciprocals[query_name] << Hit.new(list)
31
- end
25
+ class Tester
26
+ def self.testpath file
27
+ return File.join(File.dirname(__FILE__), 'data', file)
28
+ end
32
29
 
33
- def remove_hit(query_name)
34
- @reciprocals.delete(query_name)
35
- end
30
+ def self.run_comp_metrics(query, target)
31
+ querypath = testpath(query)
32
+ targetpath = testpath(target)
33
+ @assembly = Transrate::Assembly.new(querypath)
34
+ @reference = Transrate::Assembly.new(targetpath)
35
+ @comp = Transrate::ComparativeMetrics.new(@assembly, @reference, 1)
36
+ @comp.run
37
+ return @comp
36
38
  end
37
39
  end
38
40
 
39
- class TestCompMetrics < Test::Unit::TestCase
41
+ class TestCompMetrics2 < Test::Unit::TestCase
42
+
40
43
 
41
44
  context "ComparativeMetrics" do
42
45
 
43
46
  setup do
44
- querypath = File.join(File.dirname(__FILE__),
45
- 'data',
46
- 'assembly.2.fa')
47
- targetpath = File.join(File.dirname(__FILE__),
48
- 'data',
49
- 'Os.protein.2.fa')
50
- @assembly = Transrate::Assembly.new(querypath)
51
- @q_ids = @assembly.assembly.keys
52
- @reference = Transrate::Assembly.new(targetpath)
53
- @t_ids = @reference.assembly.keys
54
- threads = 8
55
- @comp = Transrate::ComparativeMetrics.new(@assembly, @reference, threads)
56
- end
57
-
58
- should "run metrics on assembly" do
59
- Dir.mktmpdir do |tmpdir|
60
- Dir.chdir tmpdir do
61
- @comp.run
62
- assert @comp.has_run
63
- end
64
- end
65
- end
66
-
67
- should "calculate reference coverage" do
68
- crb = @comp.reciprocal_best_blast
69
- # change the results so i know what i have
70
- # qstart, qend, tstart, tend, qlen, tlen
71
- #
72
- # Q |------------|
73
- # T1 |-------------------------|
74
- crb.change_hit("scaf_Os03g60760.1", "LOC_Os03g60760.1",
75
- 1, 300, 101, 200, 300, 200) # 0.5
76
- @reference["LOC_Os03g60760.1"].seq = "A"*200
77
- #
78
- # Q1 |----------|
79
- # Q2 |------------|
80
- # T2 |-------------------------------|
81
- crb.change_hit("scaf_Os10g39590.1", "LOC_Os10g39590.1",
82
- 1, 150, 51, 100, 150, 200) # 0.25
83
- crb.add_hit("scaf_Os10g39590.1", "LOC_Os10g39590.1",
84
- 1, 150, 151, 200, 150, 200) # 0.25
85
- @reference["LOC_Os10g39590.1"].seq = "A"*200
86
- #
87
- # adding first block [151..300] scaf_Os09g38670.1
88
- # 450 / 600.0
89
- # LOC_Os09g38670.1 0.75
90
-
91
- #
92
- #
93
- # Q1 |-----------|
94
- # Q2 |----------------------|
95
- # T3 |-------------------------------|
96
- crb.change_hit("scaf_Os09g38670.1", "LOC_Os09g38670.1",
97
- 1, 150, 51, 100, 150, 200) # 0.25
98
- crb.add_hit("scaf_Os09g38670.1", "LOC_Os09g38670.1",
99
- 1, 450, 26, 175, 450, 200) # 0.75
100
- @reference["LOC_Os09g38670.1"].seq = "A"*200
101
-
102
- #
103
- # Q1 |----------------------|
104
- # Q2 |-----------|
105
- # T4 |-------------------------------|
106
- crb.change_hit("scaf_Os12g21920.1", "LOC_Os12g21920.1", #
107
- 1, 450, 26, 175, 450, 200) # 0.75
108
- crb.add_hit("scaf_Os12g21920.1", "LOC_Os12g21920.1",
109
- 1, 150, 51, 100, 150, 200) # 0.25
110
- @reference["LOC_Os12g21920.1"].seq = "A"*200
47
+ end
48
+
49
+ should "01 should run" do
50
+ Dir.mktmpdir do |tmpdir|
51
+ Dir.chdir tmpdir do
52
+ comp = Tester.run_comp_metrics("test_contig_nc1.fa", "test_reference_nc1.fa")
53
+ assert comp.has_run
54
+ end
55
+ end
56
+ end
111
57
 
58
+ should "01-1n should get reference hits" do
59
+ # The reciprocals hash in crb blast has contig names as the key.
60
+ # In order to look up by the reference name we need to reverse this.
61
+ # Scan through the reciprocals and get this Hit objects and add them to
62
+ # the @reference object for each reference sequence
63
+ Dir.mktmpdir do |tmpdir|
64
+ Dir.chdir tmpdir do
65
+ query = Tester.testpath("test_contig_nc1.fa")
66
+ target = Tester.testpath("test_reference_nc1.fa")
67
+ crbblast = CRB_Blast::CRB_Blast.new query, target
68
+ crbblast.run(1e-5, 1, true)
69
+ assembly = Transrate::Assembly.new(query)
70
+ reference = Transrate::Assembly.new(target)
71
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
72
+ comp.get_reference_hits crbblast
73
+ assert_equal 1, comp.reference["reference1"].hits.size, "size of reference hits list"
74
+ assert_equal "contig1", comp.reference["reference1"].hits[0].query
75
+ assert_equal "reference1", comp.reference["reference1"].hits[0].target
76
+ end
77
+ end
78
+ end
112
79
 
113
- #
114
- # Q1 |------|
115
- # Q2 |--------|
116
- # Q3 |-----------------|
117
- # T5 |-------------------------------|
118
- crb.change_hit("scaf_Os01g36294.1", "LOC_Os01g36294.1", #
119
- 1, 300, 51, 100, 300, 400)
120
- crb.add_hit("scaf_Os01g36294.1", "LOC_Os01g36294.1",
121
- 1, 300, 200, 250, 300, 400)
122
- crb.add_hit("scaf_Os01g36294.1", "LOC_Os01g36294.1",
123
- 1, 300, 75, 225, 300, 400)
124
- @reference["LOC_Os01g36294.1"].seq = "A"*400
80
+ should "01-1n get per contig reference coverage" do
81
+ Dir.mktmpdir do |tmpdir|
82
+ Dir.chdir tmpdir do
83
+ query = Tester.testpath("test_contig_nc1.fa")
84
+ target = Tester.testpath("test_reference_nc1.fa")
85
+ crbblast = CRB_Blast::CRB_Blast.new query, target
86
+ crbblast.run(1e-5, 1, true)
87
+ assembly = Transrate::Assembly.new(query)
88
+ reference = Transrate::Assembly.new(target)
89
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
90
+ comp.get_reference_hits crbblast
91
+ comp.per_query_contig_reference_coverage
92
+ assert_equal (2/3.0), comp.assembly["contig1"].reference_coverage
93
+ end
94
+ end
95
+ end
96
+
97
+ should "01-1a get per contig reference coverage on protein" do
98
+ Dir.mktmpdir do |tmpdir|
99
+ Dir.chdir tmpdir do
100
+ query = Tester.testpath("test_contig_nc1.fa")
101
+ target = Tester.testpath("test_reference_aa1.fa")
102
+ crbblast = CRB_Blast::CRB_Blast.new query, target
103
+ crbblast.run(1e-5, 1, true)
104
+ assembly = Transrate::Assembly.new(query)
105
+ reference = Transrate::Assembly.new(target)
106
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
107
+ comp.get_reference_hits crbblast
108
+ comp.per_query_contig_reference_coverage
109
+ assert_equal (2/3.0), comp.assembly["contig1"].reference_coverage
110
+ end
111
+ end
112
+ end
113
+
114
+ should "01e raise error because you can't have protein queries" do
115
+ Dir.mktmpdir do |tmpdir|
116
+ Dir.chdir tmpdir do
117
+ query = Tester.testpath("test_reference_aa1.fa")
118
+ target = Tester.testpath("test_contig_nc1.fa")
119
+ crbblast = CRB_Blast::CRB_Blast.new query, target
120
+ crbblast.run(1e-5, 1, true)
121
+ assembly = Transrate::Assembly.new(query)
122
+ reference = Transrate::Assembly.new(target)
123
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
124
+ comp.get_reference_hits crbblast
125
+ assert_raise Transrate::TransrateError do
126
+ comp.per_query_contig_reference_coverage
127
+ end
128
+ end
129
+ end
130
+ end
131
+
132
+ should "02-2n calculate coverage for each reference sequence" do
133
+ Dir.mktmpdir do |tmpdir|
134
+ # tmpdir = Dir.mktmpdir
135
+ # puts tmpdir
136
+ Dir.chdir tmpdir do
137
+ query = Tester.testpath("test_contig_nc2.fa")
138
+ target = Tester.testpath("test_reference_nc1.fa")
139
+ crbblast = CRB_Blast::CRB_Blast.new query, target
140
+ crbblast.run(1e-5, 1, true)
141
+ assembly = Transrate::Assembly.new(query)
142
+ reference = Transrate::Assembly.new(target)
143
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
144
+ comp.get_reference_hits crbblast
145
+ comp.per_query_contig_reference_coverage
146
+ comp.per_target_contig_reference_coverage crbblast
147
+ # answer should be 290/300.0
148
+ assert_equal 29/30.0, comp.reference["reference1"].reference_coverage
149
+ end
150
+ end
151
+ end
125
152
 
126
- crb.change_hit("scaf_Os12g22750.1", "LOC_Os12g22750.1",
127
- 1, 300, 101, 200, 300, 200) # 0.5 # 300/600
128
- @reference["LOC_Os12g22750.1"].seq = "A"*200
153
+ should "02-3n calculate coverage for each reference sequence" do
154
+ Dir.mktmpdir do |tmpdir|
155
+ Dir.chdir tmpdir do
156
+ query = Tester.testpath("test_contig_nc3.fa")
157
+ target = Tester.testpath("test_reference_nc1.fa")
158
+ crbblast = CRB_Blast::CRB_Blast.new query, target
159
+ crbblast.run(1e-5, 1, true)
160
+ crbblast.add_missing
161
+ assembly = Transrate::Assembly.new(query)
162
+ reference = Transrate::Assembly.new(target)
163
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
164
+ comp.get_reference_hits crbblast
165
+ comp.per_query_contig_reference_coverage
166
+ comp.per_target_contig_reference_coverage crbblast
167
+ # answer should be 1.0000
168
+ assert_equal 1.00, comp.reference["reference1"].reference_coverage
169
+ end
170
+ end
171
+ end
129
172
 
130
- crb.change_hit("scaf_Os02g55190.1", "LOC_Os02g55190.1",
131
- 1, 300, 101, 200, 300, 200) # 0.5 # 300/600
132
- @reference["LOC_Os02g55190.1"].seq = "A"*200
173
+ should "02-3a calculate coverage for each reference sequence" do
174
+ Dir.mktmpdir do |tmpdir|
175
+ Dir.chdir tmpdir do
176
+ query = Tester.testpath("test_contig_nc3.fa")
177
+ target = Tester.testpath("test_reference_aa1.fa")
178
+ crbblast = CRB_Blast::CRB_Blast.new query, target
179
+ crbblast.run(1e-5, 1, true)
180
+ crbblast.add_missing
181
+ assembly = Transrate::Assembly.new(query)
182
+ reference = Transrate::Assembly.new(target)
183
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
184
+ comp.get_reference_hits crbblast
185
+ comp.per_query_contig_reference_coverage
186
+ comp.per_target_contig_reference_coverage crbblast
187
+ # answer should be 1.0000
188
+ assert_equal 1.00, comp.reference["reference2"].reference_coverage
189
+ end
190
+ end
191
+ end
133
192
 
134
- crb.change_hit("scaf_Os03g56500.1", "LOC_Os03g56500.1",
135
- 1, 300, 101, 200, 300, 400) # 0.25
136
- crb.change_hit("scaf_Os03g56500.2", "LOC_Os03g56500.1",
137
- 1, 300, 201, 300, 300, 400) # 0.25 # 600 / 1200
138
- @reference["LOC_Os03g56500.1"].seq = "A"*400
193
+ should "02-4n calculate coverage for each reference sequence" do
194
+ Dir.mktmpdir do |tmpdir|
195
+ Dir.chdir tmpdir do
196
+ query = Tester.testpath("test_contig_nc4.fa")
197
+ target = Tester.testpath("test_reference_nc1.fa")
198
+ crbblast = CRB_Blast::CRB_Blast.new query, target
199
+ crbblast.run(1e-5, 1, true)
200
+ assembly = Transrate::Assembly.new(query)
201
+ reference = Transrate::Assembly.new(target)
202
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
203
+ comp.get_reference_hits crbblast
204
+ comp.per_query_contig_reference_coverage
205
+ comp.per_target_contig_reference_coverage crbblast
206
+ assert_equal 0.5, comp.reference["reference1"].reference_coverage
207
+ end
208
+ end
209
+ end
139
210
 
140
- crb.change_hit("scaf_Os03g56724.1", "LOC_Os03g56724.1",
141
- 1, 300, 101, 200, 300, 200) # 300/600 = 0.5
142
- @reference["LOC_Os03g56724.1"].seq = "A"*200
211
+ should "02-4a calculate coverage for each reference sequence" do
212
+ Dir.mktmpdir do |tmpdir|
213
+ Dir.chdir tmpdir do
214
+ query = Tester.testpath("test_contig_nc4.fa")
215
+ target = Tester.testpath("test_reference_aa1.fa")
216
+ crbblast = CRB_Blast::CRB_Blast.new query, target
217
+ crbblast.run(1e-5, 1, true)
218
+ assembly = Transrate::Assembly.new(query)
219
+ reference = Transrate::Assembly.new(target)
220
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
221
+ comp.get_reference_hits crbblast
222
+ comp.per_query_contig_reference_coverage
223
+ comp.per_target_contig_reference_coverage crbblast
224
+ assert_equal 0.5, comp.reference["reference2"].reference_coverage
225
+ end
226
+ end
227
+ end
143
228
 
144
- crb.remove_hit("scaf_Os01g11360.1")
229
+ should "02-5a calculate coverage for each reference sequence" do
230
+ Dir.mktmpdir do |tmpdir|
231
+ Dir.chdir tmpdir do
232
+ query = Tester.testpath("test_contig_nc5.fa")
233
+ target = Tester.testpath("test_reference_aa1.fa")
234
+ crbblast = CRB_Blast::CRB_Blast.new query, target
235
+ crbblast.run(1e-5, 1, true)
236
+ crbblast.add_missing
237
+ assembly = Transrate::Assembly.new(query)
238
+ reference = Transrate::Assembly.new(target)
239
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
240
+ comp.get_reference_hits crbblast
241
+ comp.per_query_contig_reference_coverage
242
+ comp.per_target_contig_reference_coverage crbblast
243
+ assert_equal (2/3.0), comp.reference["reference2"].reference_coverage
244
+ end
245
+ end
246
+ end
145
247
 
146
- @reference["LOC_Os03g08270.3"].seq = "A"*200
147
- @reference["LOC_Os10g41970.1"].seq = "A"*200
148
- @reference["LOC_Os09g26780.1"].seq = "A"*200
149
- @reference["LOC_Os12g24659.1"].seq = "A"*200
150
- @reference["LOC_Os01g36410.1"].seq = "A"*200
151
- @reference["LOC_Os12g22780.1"].seq = "A"*200
152
- @reference["LOC_Os02g56470.1"].seq = "A"*200
153
- @reference["LOC_Os03g30530.1"].seq = "A"*200
154
- @reference["LOC_Os03g49850.1"].seq = "A"*200
155
- @reference["LOC_Os01g11360.1"].seq = "A"*200
156
- @reference["LOC_Os01g44140.1"].seq = "A"*200
248
+ should "02-5n calculate coverage for each reference sequence" do
249
+ Dir.mktmpdir do |tmpdir|
250
+ Dir.chdir tmpdir do
251
+ query = Tester.testpath("test_contig_nc5.fa")
252
+ target = Tester.testpath("test_reference_nc1.fa")
253
+ crbblast = CRB_Blast::CRB_Blast.new query, target
254
+ crbblast.run(1e-5, 1, true)
255
+ crbblast.add_missing
256
+ assembly = Transrate::Assembly.new(query)
257
+ reference = Transrate::Assembly.new(target)
258
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
259
+ comp.get_reference_hits crbblast
260
+ comp.per_query_contig_reference_coverage
261
+ comp.per_target_contig_reference_coverage crbblast
262
+ assert_equal (2/3.0), comp.reference["reference1"].reference_coverage
263
+ end
264
+ end
265
+ end
157
266
 
158
- assert_equal true, crb.target_is_prot, "target is prot"
159
- assert_equal false, crb.query_is_prot, "query is prot"
160
- # total_length of references should be 4400
267
+ should "02-6a calculate coverage for each reference sequence" do
268
+ Dir.mktmpdir do |tmpdir|
269
+ Dir.chdir tmpdir do
270
+ query = Tester.testpath("test_contig_nc6.fa")
271
+ target = Tester.testpath("test_reference_aa1.fa")
272
+ crbblast = CRB_Blast::CRB_Blast.new query, target
273
+ crbblast.run(1e-5, 1, true)
274
+ crbblast.add_missing
275
+ assembly = Transrate::Assembly.new(query)
276
+ reference = Transrate::Assembly.new(target)
277
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
278
+ comp.get_reference_hits crbblast
279
+ comp.per_query_contig_reference_coverage
280
+ comp.per_target_contig_reference_coverage crbblast
281
+ assert_equal (1/3.0), comp.reference["reference2"].reference_coverage
282
+ end
283
+ end
284
+ end
161
285
 
162
- cov = @comp.coverage crb
163
- assert_equal 3600/13200.0, cov, "reference coverage"
286
+ should "02-6n calculate coverage for each reference sequence" do
287
+ Dir.mktmpdir do |tmpdir|
288
+ Dir.chdir tmpdir do
289
+ query = Tester.testpath("test_contig_nc6.fa")
290
+ target = Tester.testpath("test_reference_nc1.fa")
291
+ crbblast = CRB_Blast::CRB_Blast.new query, target
292
+ crbblast.run(1e-5, 1, true)
293
+ crbblast.add_missing
294
+ assembly = Transrate::Assembly.new(query)
295
+ reference = Transrate::Assembly.new(target)
296
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
297
+ comp.get_reference_hits crbblast
298
+ comp.per_query_contig_reference_coverage
299
+ comp.per_target_contig_reference_coverage crbblast
300
+ assert_equal (1/3.0), comp.reference["reference1"].reference_coverage
301
+ end
302
+ end
164
303
  end
165
304
 
166
- should "calculate overlap amount" do
167
- assert_equal 0.5, @comp.overlap_amount(201,500,101,400), "1"
168
- assert_equal 0.5, @comp.overlap_amount(101,400,201,500), "2"
169
- assert_equal 0.5, @comp.overlap_amount(201,400,101,500), "3"
170
- assert_equal 0.5, @comp.overlap_amount(101,500,201,400), "4"
171
- end
172
-
173
- should "calculate number of contigs with crbblast hit" do
305
+ should "02-7a calculate coverage for each reference sequence" do
174
306
  Dir.mktmpdir do |tmpdir|
175
307
  Dir.chdir tmpdir do
176
- @comp.run
177
- assert_equal 11, @comp.comp_stats[:n_contigs_with_CRBB]
178
- assert_equal 11/13.0, @comp.comp_stats[:p_contigs_with_CRBB]
308
+ query = Tester.testpath("test_contig_nc7.fa")
309
+ target = Tester.testpath("test_reference_aa1.fa")
310
+ crbblast = CRB_Blast::CRB_Blast.new query, target
311
+ crbblast.run(1e-5, 1, true)
312
+ crbblast.add_missing
313
+ assembly = Transrate::Assembly.new(query)
314
+ reference = Transrate::Assembly.new(target)
315
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
316
+ comp.get_reference_hits crbblast
317
+ comp.per_query_contig_reference_coverage
318
+ comp.per_target_contig_reference_coverage crbblast
319
+ assert_equal (1/3.0), comp.reference["reference2"].reference_coverage
179
320
  end
180
321
  end
181
322
  end
182
323
 
183
- should "calculate number of reference sequences with crbblast hit" do
324
+ should "02-7n calculate coverage for each reference sequence" do
184
325
  Dir.mktmpdir do |tmpdir|
185
326
  Dir.chdir tmpdir do
186
- @comp.run
187
- assert_equal 10, @comp.comp_stats[:n_refs_with_CRBB]
188
- assert_equal 0.5, @comp.comp_stats[:p_refs_with_CRBB]
327
+ query = Tester.testpath("test_contig_nc7.fa")
328
+ target = Tester.testpath("test_reference_nc1.fa")
329
+ crbblast = CRB_Blast::CRB_Blast.new query, target
330
+ crbblast.run(1e-5, 1, true)
331
+ crbblast.add_missing
332
+ assembly = Transrate::Assembly.new(query)
333
+ reference = Transrate::Assembly.new(target)
334
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
335
+ comp.get_reference_hits crbblast
336
+ comp.per_query_contig_reference_coverage
337
+ comp.per_target_contig_reference_coverage crbblast
338
+ assert_equal (1/3.0), comp.reference["reference1"].reference_coverage
189
339
  end
190
340
  end
191
341
  end
192
342
 
193
- should "calculate reference sequence coverage" do
194
- # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
195
- # of their length by CRB-BLAST hit
343
+ should "03 calculate all metrics" do
196
344
  Dir.mktmpdir do |tmpdir|
197
345
  Dir.chdir tmpdir do
198
- @comp.run
199
- stats = @comp.comp_stats
200
- assert_equal 10, stats[:cov25]
201
- assert_equal 10, stats[:cov50]
202
- assert_equal 7, stats[:cov75]
203
- assert_equal 6, stats[:cov85]
204
- assert_equal 3, stats[:cov95]
346
+ query = Tester.testpath("assembly.2.fa")
347
+ target = Tester.testpath("Os.protein.2.fa")
348
+ crbblast = CRB_Blast::CRB_Blast.new query, target
349
+ crbblast.run(1e-5, 1, true)
350
+ assembly = Transrate::Assembly.new(query)
351
+ reference = Transrate::Assembly.new(target)
352
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
353
+ comp.get_reference_hits crbblast
354
+ comp.per_query_contig_reference_coverage
355
+ comp.per_target_contig_reference_coverage crbblast
356
+ assert_equal 11, comp.comp_stats[:CRBB_hits], "CRBB hits"
357
+ assert_equal 11, comp.comp_stats[:n_contigs_with_CRBB], "n_contigs_with_CRBB"
358
+ assert_equal 0.84615, comp.comp_stats[:p_contigs_with_CRBB].round(5), "p_contigs_with_CRBB"
359
+ assert_equal 0.55, comp.comp_stats[:rbh_per_reference], "rbh_per_reference"
360
+ assert_equal 10, comp.comp_stats[:n_refs_with_CRBB], "n_refs_with_CRBB"
361
+ assert_equal 0.5, comp.comp_stats[:p_refs_with_CRBB], "p_refs_with_CRBB"
362
+ assert_equal 10, comp.comp_stats[:cov25], "cov25"
363
+ assert_equal 10, comp.comp_stats[:cov50], "cov50"
364
+ assert_equal 7, comp.comp_stats[:cov75], "cov75"
365
+ assert_equal 6, comp.comp_stats[:cov85], "cov85"
366
+ assert_equal 3, comp.comp_stats[:cov95], "cov95"
367
+ assert_equal 0.5, comp.comp_stats[:p_cov25], "p_cov25"
368
+ assert_equal 0.5, comp.comp_stats[:p_cov50], "p_cov50"
369
+ assert_equal 0.35, comp.comp_stats[:p_cov75], "p_cov75"
370
+ assert_equal 0.3, comp.comp_stats[:p_cov85], "p_cov85"
371
+ assert_equal 0.15, comp.comp_stats[:p_cov95], "p_cov95"
372
+ assert_equal 0.37261, comp.comp_stats[:reference_coverage].round(5), "reference_coverage"
373
+
205
374
  end
206
375
  end
207
376
  end
208
377
 
378
+
379
+
209
380
  end
210
381
  end