transrate 1.0.0.beta3 → 1.0.0.beta4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,7 @@
1
1
  module Transrate
2
2
 
3
+ class ReadMetricsError < TransrateError; end
4
+
3
5
  class ReadMetrics
4
6
 
5
7
  attr_reader :fragments, :fragments_mapping, :p_good_mapping
@@ -9,7 +11,7 @@ module Transrate
9
11
  attr_reader :read_length
10
12
 
11
13
  def initialize assembly
12
- @assembly = assembly
14
+ @assembly = assembly # Transrate::Assembly
13
15
  @mapper = Snap.new
14
16
  @salmon = Salmon.new
15
17
  self.initial_values
@@ -60,7 +62,12 @@ module Transrate
60
62
  if !File.exist?(assigned_bam)
61
63
  assigned_bam = assign_and_quantify(bamfile, threads)
62
64
  end
63
- File.rename(assigned_bam, final_bam)
65
+ if File.exist?(assigned_bam)
66
+ File.rename(assigned_bam, final_bam)
67
+ else
68
+ logger.error "Couldn't find #{assigned_bam} to rename"
69
+ raise ReadMetricsError
70
+ end
64
71
  end
65
72
  # analyse the final mappings
66
73
  analyse_read_mappings final_bam
@@ -116,6 +123,7 @@ module Transrate
116
123
  def analyse_expression salmon_output
117
124
  salmon_output.each_pair do |name, expr|
118
125
  contig_name = Bio::FastaDefline.new(name.to_s).entry_id
126
+ contig_name.gsub!(/;$/, '') # trim trailing semicolon
119
127
  contig = @assembly[contig_name]
120
128
  if expr[:eff_len]==0
121
129
  coverage = 0
@@ -188,6 +196,7 @@ module Transrate
188
196
 
189
197
  def populate_contig_data row
190
198
  name = Bio::FastaDefline.new(row[:name].to_s).entry_id
199
+ name.gsub!(/;$/, '') # trim trailing semicolon
191
200
  contig = @assembly[name]
192
201
  contig.p_seq_true = row[:p_seq_true]
193
202
  contig.uncovered_bases = row[:bases_uncovered]
@@ -17,6 +17,7 @@ module Transrate
17
17
  def run assembly, bamfile, threads=8
18
18
  assembly = assembly.file if assembly.is_a? Assembly
19
19
  output = "quant.sf"
20
+ sampled_bam = "postSample.bam"
20
21
  @fin_output = "#{File.basename assembly}_#{output}"
21
22
  unless File.exist? @fin_output
22
23
  salmon = Cmd.new build_command(assembly, bamfile, threads)
@@ -25,9 +26,13 @@ module Transrate
25
26
  logger.error salmon.stderr
26
27
  raise SalmonError.new("Salmon failed")
27
28
  end
29
+ unless File.exist?(sampled_bam)
30
+ logger.error salmon.stderr
31
+ raise SalmonError.new("#{sampled_bam} not created")
32
+ end
28
33
  File.rename(output, @fin_output)
29
34
  end
30
- return 'postSample.bam'
35
+ return sampled_bam
31
36
  end
32
37
 
33
38
  def build_command assembly, bamfile, threads=4
@@ -39,7 +44,7 @@ module Transrate
39
44
  cmd << " --sampleOut"
40
45
  cmd << " --sampleUnaligned" # thanks Rob!
41
46
  cmd << " --output ."
42
- cmd << " --useReadCompat"
47
+ cmd << " --useVBOpt"
43
48
  cmd << " --useErrorModel"
44
49
  cmd
45
50
  end
@@ -49,9 +54,14 @@ module Transrate
49
54
  File.open(file).each do |line|
50
55
  if line !~ /^#/
51
56
  line = line.chomp.split("\t")
57
+ unless line.length == 4
58
+ raise SalmonError.new("Salmon output file should have 4 columns " +
59
+ "but it had #{line.length}\n" +
60
+ "Please check you are using the correct version of Salmon")
61
+ end
52
62
  target = line[0]
53
63
  effective_length = line[1]
54
- effective_count = line[4]
64
+ effective_count = line[3]
55
65
  tpm = line[2]
56
66
  expression[target] = {
57
67
  :eff_len => effective_length.to_i,
@@ -6,6 +6,8 @@ module Transrate
6
6
  # while also minimising the number of low scoring contigs.
7
7
  class ScoreOptimiser
8
8
 
9
+ require 'csv'
10
+
9
11
  def initialize assembly, read_metrics
10
12
  @assembly = assembly
11
13
  @fragments = read_metrics.fragments
@@ -20,7 +22,7 @@ module Transrate
20
22
  @contig_score * (@good / @total.to_f)
21
23
  end
22
24
 
23
- def optimal_score
25
+ def optimal_score(prefix='assembly')
24
26
  return [@optimal, @cutoff] unless @optimal.nil?
25
27
  product = 0
26
28
  good = 0
@@ -44,7 +46,10 @@ module Transrate
44
46
  end
45
47
  @optimal = 0
46
48
  @cutoff = 0
49
+ out = CSV.open("#{prefix}_score_optimisation.csv", 'w')
50
+ out << %w[cutoff assembly_score]
47
51
  cutoffscores.each do |c, score|
52
+ out << [c, score]
48
53
  if score > @optimal
49
54
  @optimal = score
50
55
  @cutoff = c
@@ -41,8 +41,7 @@ module Transrate
41
41
  cmd
42
42
  end
43
43
 
44
- def map_reads(file, left, right, insertsize: 200,
45
- insertsd: 50, outputname: nil, threads: 8)
44
+ def map_reads(file, left, right, outputname: nil, threads: 8)
46
45
  raise SnapError.new("Index not built") if !@index_built
47
46
 
48
47
  lbase = File.basename(left.split(",").first)
@@ -69,11 +69,11 @@ module Transrate
69
69
  return @score_optimiser.raw_score
70
70
  end
71
71
 
72
- def assembly_optimal_score
72
+ def assembly_optimal_score prefix
73
73
  if !@score_optimiser
74
74
  @score_optimiser = ScoreOptimiser.new(@assembly, @read_metrics)
75
75
  end
76
- return @score_optimiser.optimal_score
76
+ return @score_optimiser.optimal_score prefix
77
77
  end
78
78
 
79
79
  def assembly_metrics
@@ -11,7 +11,7 @@ module Transrate
11
11
  MAJOR = 1
12
12
  MINOR = 0
13
13
  PATCH = 0
14
- BUILD = 'beta3'
14
+ BUILD = 'beta4'
15
15
 
16
16
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
17
17
  end
@@ -1,6 +1,6 @@
1
1
  >Sb09g017110.1
2
2
  AGCGTGGGAAGGGAGATCACTGGAAGGGAGAAAGAAATGGAGCATGAAGAAGAGCTGCCTTCATCTTCTTCCTCCTTGGGTTACCTGATGCAGTGTAGGATCTGCCACGAGGAAGAGAACGAAGGGCGCGCGATCATGGAGTCTCCTTGTGGATGCTCCGGCTCTCTCAAGTATGCTCACAGGGGATGTGTGCAGAGATGGTGTGATGAGAAGGGGAGCACCCTCTGTGAGATTTGCCTTCAGAATTTCGAGCCAGGCTACACAATGCCTCCAAAGAAAACTCCGGCGATTGAAACTGCGGTCACTATCAGTGAACATGAGGACATGCAACCTTTGGAATCTCCGGAGGGCTCCATTGACGGTGCAGATTACACCAGGTGCTCCTACGCCGCAGATCAATGCGCCACATGGTGCCGGTCGCTGGCGATCACGTTCACCATTATGCTGTTGGCATGGCATCTGGTTGCAGTAGTGACGGTTGAAGCAGCAGATCACTGCGCGTTCAGTCTCCTGACAATGTACTTACTTCGTGCTGCTGGTATCCTGCTGCCGCTCTATGTTGTCATGCGGCTGATTCGCATCGTCCAGAATGGGCAGAGGCAGTATCGGTTGCAGCTGCTGGAGGACCAAAGAAGAAATGCATCAACTATATGAATTATATGCGTGACCACGAGAAGGACCAGCTAGTTATTAATATTCATTAAGCCAATGAATTTAAATGAATCTTATGGAAGGTTGTACAGATGTACATTATGTATTACTGGGGAATTTTTCAAACAGAACATCATTTGTAAACTTTCAGTATACATACCGTTGGTGCAGTGAAAAACAGCAGC
3
- >Sb02g028080.1
3
+ >Sb02g028080.1;
4
4
  ATGGGGACCCCTCTCCTCTTCCCCCTTCTCGTCACCCTCCAGCTGTTCACCGCCGCCTCCCCCGCGGTCGCGTCGTCGCACATCTCCGTCGTCATCTCGCAGTCGGGCCTCGACTTCGCCAAGGACCTGCTCGTGTCCCGTGCCGTCGCGACCCTCACGCCCCTGAACGTGCCGGACATCCAGAAGACCATGAGCACCGTCGTGGGCACCGTCCGCGTGGCCGCATCCGGGATTGTGCTCAACGGCCTCGCCGTCACCAACTCCACCGTCGCTATCGGGGACACGGGTGTTGTCGTGGCCGCCTCGTTGGCCAGAGCGAACCTCACCATGGAGTGGAACTACTCGTACAGCGCCTGGATTGTGACCATATCCGACAGCGGGAATGCTTCGATCCAGGTTGAAGGAATGGAGGTTGGTGTTTCCATGGTCATGAAGAATCAGAATGGATCTATCAAGCTGTCTGTTACAGAATGTAGCTGTAATATGGAGGACTTAGACATAACACTAAGTGGAGGAGCATCTTGGTTCTATCAAGTGTTTATAGATAGTTTCAGTAATCATATCAGATCATCAGTGGAAAATGCAATTGAGAACAAAGTAATGGAAGGTGCACTGAAGCTTGACTCTTTCCTGGGAAACCTTCCAAAGAAAATTGATCTTGATAGCGTTGCTGCAATGAATGTGACTTTTGTTAATGATCCACTATTCAAGAGCTCCTCTGTTGAGTTTGATATAGATGGCTTATTTATTCCATCGGATGAAACTGCTCCCGGAGACATGCTTCTTGGAAATACCCAATTTGCATTACCTCTTGGGAGCTCCTCGAGAATGCTTTGGATTTCATTGGATGAAGATGTTTTCAACTCCGTTTCAGCTCTCTACTTCAAGGCTGGTTTGCTGCAACGGATGGTGGACGAGGTTCCTGAACAGTTTCTTTTGAACACTGCTAGCTGGAGATTTTTGGTTCCTCGATTGTATCGAGAATATCCTGACGATGATATGCTACTGAATATCTCTGCAGTTTCGCCTCCCTCTGTGAGGATTAATGTGGGTAGAATTGATGCCACAGTTGACTTAGATGTCACAGTTAATGTCTTGGATTTTGGTGAGATAGTTCCAGTTGCATGCATATCAGTGTCGGTTGCTGTTTCTGGAGCTGCAGCGGTATCAGAGAATAATCTTGTTGGGAGAGTGAAATTGGATTATTTCTCATTTACCTTGAAATGGAGCAAAGTTGGCAAACTCCACACCAGTCTAGTGCAGACCGTGCTGAGGATTTTGCTGAAAAGTTTGTTTGTACCTTATGTGAACTCATATCTCGAGCAAGGCTTCCAGCTGCCCATCATCAAGGGATTCTCCGTCATAGATGCATATGTCCTCACTTCTTACTCAAGAATGATTGTTAGCTGCAATGTTGCGTTCCCTGAGCCAGAGGTTCTGTCTCCTATCCAAGAATCCAAGACCAACGAAGATTTGTCACATGAAGTTGGTTTGCTGATTGGATCTGCCAAAACTTGGCAGCCACCGATAACTAGTGTAAAATTCCTGTAAACAGTAACGCTATTCGTCGCAGTGTTTTGTTTTTAAAAATGTGTAAATAACTGTGCACTGTATATATGTATGTATGTATGTATGTATATATGCATATTAAATCAACAGATAGGAGCCATTCCGTAGCCTCTAAGTGGAATCGTTAGGTACTACAGTTTCCTCTATCCCAAATTATAAGCTGTTCTAGTTTTTT
5
5
  >Sb03g034100.1
6
6
  CTCACTCTCACACTACTCCTCCCCTCTCCGGCTCTCTGCCTCTGACGTCTGACCTCTCCTCCCCAACGGTGAGGCCGGCGCATTGCCGTTTCGAGCGCGGACACCGAGGGCTAGAACTAGAAGTGGCGGCGGTGCCAGGGCTCGGCGCTCGGTCGGCAATGGCGGGGCGGCTTATGCTGGCGGCGCTCCCTATTCTCCTCTTCTTATTGCTCGTCGGGCAATGCCACGGCGGCAAGATTGGCGTCTGCTACGGCCGCAACGCCGACGACCTGCCGGCGCCGGACAAGGTGGCGCAGCTAATCCAGCAGCAATCCATCAAGTACGTGCGCATCTACGACACCAACATCGACGTCATCAAGGCCTTCGCCAACACCGGCGTCGAGCTCATGGTCGGCGTCCCCAACTCCGACCTCCTCGCCTTCGCGCAGTACCAGTCCAACGTCGACACATGGCTCAAGAACAGCATTCTCCCCTACTACCCGGCCACCATGATCACCTACATCACCGTCGGCGCCGAGGTCACCGAGAGCCCCACCAACGTCTCCGCCCTCGTCGTGCCTGCCATGCGCAATGTGCACACCGCACTCAAGAAGGCCGGCCTGCACAAGAAGATCACCATCTCCAGCACCCACTCGCTCGGGATACTGTCACGGTCGTTCCCGCCGTCTGCTGGGGCGTTCAACAGCAGCTACGCCTACTTCTTGAAGCCTATGCTCGAGTTCCTTGTGGAGAATCAGGCGCCGTTCATGGTGGATTTATACCCCTACTATGCGTACCAGAACTCACCGAGCAATGTGTCCCTCAACTACGCCCTGTTCTCGCCACAGTCTCAGGATGTGATTGACCCAAACACTGGACTGGTTTACACTAACATGTTTGATGCCCAGGTTGATTCCATCTTCTTTGCGCTCATGGCTCTGAACTTCAAAACTCTGAAGATCATGATCACTGAGTCAGGGTGGCCAAACAAAGGGGCGGCCAAGGAGACTGGAGCCACTCCAGACAATGCTCAGACTTACAATACCAATTTGATACGCCATGTTGTTAATGACAGTGGCACGCCTGCGAAACCAGGGGAAGAAATTGATGTCTACATATTTTCATTGTTCAATGAGAACAGGAAACCTGGCATTGAGTCGGAGAGGAACTGGGGACTGTTTTTTCCTGATAAGAGCTCTATCTACAGCCTTGATTGGACGGGCCGAGGCAATGTGGATGTTATGACTGGAGCAAACATTACAAGTGCAAATGGTACCTGGTGTATTGCTTCAGCTAATGCATCAGAAACAGATCTGCAGAATGCCCTCAACTGGGCATGTGGTCCAGGCAACGTAGATTGCTCTGCCATTCAACCAAGCCAACCCTGCTACCAGCCGGACACTTTAGCTTCCCATGCTTCATATGCATTCAATAGCTACTACCAGCAAAATGGAGCCAACGTTGTGGCCTGTGACTTCAGTGGTGCGGGAATACGAACGACGAAAGATCCAAGTTACGACACTTGTGTCTATTTGGCTGCAGGCAATAAGATGAGCACAATGAATTCGACATCTCTTCCAGCTCAGAGCAACTCTGGTCCAGTTCCATGCGCCAAATACTTCACCACTTTCCTCCCCATGCTGGCCCCCGTGATGGCTGCAGTTATGCTGTGATCTATGGAAATGCTCCAGCTAGCCTCTGCAGATGTGGAGATGAAAGGTGAATTGCGTAATGCTGGTAACCAGCCGATGTTCTGTTTTGCTATGAGCAGTAGACTAGTAGTAGTCTAGTAGAGAGGCATATTATGCTGCTGTAGGAATTCTCTGGTCAGTTGAGATGTACATCGTCGCGCAGACAATATATATCAGCTGGCCTTAAGAACTCGATAACCTTTTCTGCTGTCTTTCG
data/test/data/test.sf CHANGED
@@ -9,22 +9,22 @@
9
9
  # [ sampleOut ] => { }
10
10
  # [ useFragLenDist ] => { }
11
11
  # [ sampleUnaligned ] => { }
12
- # Name Length TPM FPKM NumReads
13
- scaffold1 1016 549.279 527.364 20690
14
- scaffold2 1439 598.782 574.892 31945
15
- scaffold3 783 408.072 391.791 11846
16
- scaffold4 893 441.382 423.772 14613
17
- scaffold5 622 494.487 474.758 11403
18
- scaffold6 2073 4.77214 4.58174 366.764
19
- scaffold7 1291 4.288 4.11692 205.236
20
- scaffold8 1355 17.9155 17.2007 900
21
- scaffold9 258 15.891 15.257 152
22
- scaffold10 1934 104.823 100.641 7516
23
- scaffold11 1922 23.9916 23.0344 1709.57
24
- scaffold12 1651 136.498 131.052 8355
25
- scaffold13 1834 360.757 346.363 24529.4
26
- scaffold14 580 6.13864 5.89373 132
27
- scaffold15 1539 4.8197 4.62741 275
28
- scaffold16 2302 26.1878 25.1429 2235
29
- scaffold17 543 1.98695 1.90767 40
30
- scaffold18 4121 25.4151 24.4011 3883
12
+ # Name Length TPM NumReads
13
+ scaffold1 1016 549.279 20690
14
+ scaffold2 1439 598.782 31945
15
+ scaffold3 783 408.072 11846
16
+ scaffold4 893 441.382 14613
17
+ scaffold5 622 494.487 11403
18
+ scaffold6 2073 4.77214 366.764
19
+ scaffold7 1291 4.288 205.236
20
+ scaffold8 1355 17.9155 900
21
+ scaffold9 258 15.891 152
22
+ scaffold10 1934 104.823 7516
23
+ scaffold11 1922 23.9916 1709.57
24
+ scaffold12 1651 136.498 8355
25
+ scaffold13 1834 360.757 24529.4
26
+ scaffold14 580 6.13864 132
27
+ scaffold15 1539 4.8197 275
28
+ scaffold16 2302 26.1878 2235
29
+ scaffold17 543 1.98695 40
30
+ scaffold18 4121 25.4151 3883
@@ -28,9 +28,9 @@ class TestAssembly < Test::Unit::TestCase
28
28
  assert File.exist?("good.sorghum_100.fa"), "good output exists"
29
29
  assert File.exist?("bad.sorghum_100.fa"), "bad output"
30
30
  file_size = File.stat("good.sorghum_100.fa").size
31
- assert_in_delta 81_000, file_size, 5000, "good file size"
31
+ assert_in_delta 86_119, file_size, 5000, "good file size"
32
32
  file_size = File.stat("bad.sorghum_100.fa").size
33
- assert_in_delta 58_000, file_size, 5000, "bad file size"
33
+ assert_in_delta 53_000, file_size, 5000, "bad file size"
34
34
  end
35
35
  end
36
36
  end
data/test/test_bin.rb CHANGED
@@ -116,6 +116,7 @@ class TestTransrateBin < Test::Unit::TestCase
116
116
  end
117
117
  assert_in_delta 137748, hash[:n_bases], 1000, "number of bases"
118
118
  assert_equal 1692, hash[:n50], "n50"
119
+ assert_equal 25006 + 223, hash[:fragments], "number of reads"
119
120
  end
120
121
 
121
122
  should "fail when one of multiple assemblies is missing" do
@@ -1,210 +1,381 @@
1
1
  require 'helper'
2
+ require 'crb-blast'
3
+
4
+ module Transrate
5
+ class ComparativeMetrics
6
+ attr_reader :assembly
7
+ attr_reader :reference
8
+ attr_reader :crbblast
9
+ end
10
+ end
2
11
 
3
12
  module CRB_Blast
4
13
  class CRB_Blast
5
- def change_hit(query_name, target_name, qstart, qend, tstart, tend, qlen, tlen)
6
- hits = @reciprocals[query_name]
7
- hits.each do |hit|
8
- if hit.target == target_name
9
- hit.qstart = qstart
10
- hit.qend = qend
11
- hit.tstart = tstart
12
- hit.tend = tend
13
- hit.qlen = qlen
14
- hit.tlen = tlen
14
+ def add_missing
15
+ @missed.each do |query_id, missed|
16
+ missed.each do |hit|
17
+ @reciprocals[hit.query] ||= []
18
+ @reciprocals[hit.query] << hit
15
19
  end
16
20
  end
17
21
  end
22
+ end
23
+ end
18
24
 
19
- def add_hit(query_name, target_name, qstart, qend, tstart, tend, qlen, tlen)
20
- @reciprocals[query_name] ||= []
21
- list = Array.new(14)
22
- list[0] = query_name
23
- list[1] = target_name
24
- list[6] = qstart
25
- list[7] = qend
26
- list[8] = tstart
27
- list[9] = tend
28
- list[12] = qlen
29
- list[13] = tlen
30
- @reciprocals[query_name] << Hit.new(list)
31
- end
25
+ class Tester
26
+ def self.testpath file
27
+ return File.join(File.dirname(__FILE__), 'data', file)
28
+ end
32
29
 
33
- def remove_hit(query_name)
34
- @reciprocals.delete(query_name)
35
- end
30
+ def self.run_comp_metrics(query, target)
31
+ querypath = testpath(query)
32
+ targetpath = testpath(target)
33
+ @assembly = Transrate::Assembly.new(querypath)
34
+ @reference = Transrate::Assembly.new(targetpath)
35
+ @comp = Transrate::ComparativeMetrics.new(@assembly, @reference, 1)
36
+ @comp.run
37
+ return @comp
36
38
  end
37
39
  end
38
40
 
39
- class TestCompMetrics < Test::Unit::TestCase
41
+ class TestCompMetrics2 < Test::Unit::TestCase
42
+
40
43
 
41
44
  context "ComparativeMetrics" do
42
45
 
43
46
  setup do
44
- querypath = File.join(File.dirname(__FILE__),
45
- 'data',
46
- 'assembly.2.fa')
47
- targetpath = File.join(File.dirname(__FILE__),
48
- 'data',
49
- 'Os.protein.2.fa')
50
- @assembly = Transrate::Assembly.new(querypath)
51
- @q_ids = @assembly.assembly.keys
52
- @reference = Transrate::Assembly.new(targetpath)
53
- @t_ids = @reference.assembly.keys
54
- threads = 8
55
- @comp = Transrate::ComparativeMetrics.new(@assembly, @reference, threads)
56
- end
57
-
58
- should "run metrics on assembly" do
59
- Dir.mktmpdir do |tmpdir|
60
- Dir.chdir tmpdir do
61
- @comp.run
62
- assert @comp.has_run
63
- end
64
- end
65
- end
66
-
67
- should "calculate reference coverage" do
68
- crb = @comp.reciprocal_best_blast
69
- # change the results so i know what i have
70
- # qstart, qend, tstart, tend, qlen, tlen
71
- #
72
- # Q |------------|
73
- # T1 |-------------------------|
74
- crb.change_hit("scaf_Os03g60760.1", "LOC_Os03g60760.1",
75
- 1, 300, 101, 200, 300, 200) # 0.5
76
- @reference["LOC_Os03g60760.1"].seq = "A"*200
77
- #
78
- # Q1 |----------|
79
- # Q2 |------------|
80
- # T2 |-------------------------------|
81
- crb.change_hit("scaf_Os10g39590.1", "LOC_Os10g39590.1",
82
- 1, 150, 51, 100, 150, 200) # 0.25
83
- crb.add_hit("scaf_Os10g39590.1", "LOC_Os10g39590.1",
84
- 1, 150, 151, 200, 150, 200) # 0.25
85
- @reference["LOC_Os10g39590.1"].seq = "A"*200
86
- #
87
- # adding first block [151..300] scaf_Os09g38670.1
88
- # 450 / 600.0
89
- # LOC_Os09g38670.1 0.75
90
-
91
- #
92
- #
93
- # Q1 |-----------|
94
- # Q2 |----------------------|
95
- # T3 |-------------------------------|
96
- crb.change_hit("scaf_Os09g38670.1", "LOC_Os09g38670.1",
97
- 1, 150, 51, 100, 150, 200) # 0.25
98
- crb.add_hit("scaf_Os09g38670.1", "LOC_Os09g38670.1",
99
- 1, 450, 26, 175, 450, 200) # 0.75
100
- @reference["LOC_Os09g38670.1"].seq = "A"*200
101
-
102
- #
103
- # Q1 |----------------------|
104
- # Q2 |-----------|
105
- # T4 |-------------------------------|
106
- crb.change_hit("scaf_Os12g21920.1", "LOC_Os12g21920.1", #
107
- 1, 450, 26, 175, 450, 200) # 0.75
108
- crb.add_hit("scaf_Os12g21920.1", "LOC_Os12g21920.1",
109
- 1, 150, 51, 100, 150, 200) # 0.25
110
- @reference["LOC_Os12g21920.1"].seq = "A"*200
47
+ end
48
+
49
+ should "01 should run" do
50
+ Dir.mktmpdir do |tmpdir|
51
+ Dir.chdir tmpdir do
52
+ comp = Tester.run_comp_metrics("test_contig_nc1.fa", "test_reference_nc1.fa")
53
+ assert comp.has_run
54
+ end
55
+ end
56
+ end
111
57
 
58
+ should "01-1n should get reference hits" do
59
+ # The reciprocals hash in crb blast has contig names as the key.
60
+ # In order to look up by the reference name we need to reverse this.
61
+ # Scan through the reciprocals and get this Hit objects and add them to
62
+ # the @reference object for each reference sequence
63
+ Dir.mktmpdir do |tmpdir|
64
+ Dir.chdir tmpdir do
65
+ query = Tester.testpath("test_contig_nc1.fa")
66
+ target = Tester.testpath("test_reference_nc1.fa")
67
+ crbblast = CRB_Blast::CRB_Blast.new query, target
68
+ crbblast.run(1e-5, 1, true)
69
+ assembly = Transrate::Assembly.new(query)
70
+ reference = Transrate::Assembly.new(target)
71
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
72
+ comp.get_reference_hits crbblast
73
+ assert_equal 1, comp.reference["reference1"].hits.size, "size of reference hits list"
74
+ assert_equal "contig1", comp.reference["reference1"].hits[0].query
75
+ assert_equal "reference1", comp.reference["reference1"].hits[0].target
76
+ end
77
+ end
78
+ end
112
79
 
113
- #
114
- # Q1 |------|
115
- # Q2 |--------|
116
- # Q3 |-----------------|
117
- # T5 |-------------------------------|
118
- crb.change_hit("scaf_Os01g36294.1", "LOC_Os01g36294.1", #
119
- 1, 300, 51, 100, 300, 400)
120
- crb.add_hit("scaf_Os01g36294.1", "LOC_Os01g36294.1",
121
- 1, 300, 200, 250, 300, 400)
122
- crb.add_hit("scaf_Os01g36294.1", "LOC_Os01g36294.1",
123
- 1, 300, 75, 225, 300, 400)
124
- @reference["LOC_Os01g36294.1"].seq = "A"*400
80
+ should "01-1n get per contig reference coverage" do
81
+ Dir.mktmpdir do |tmpdir|
82
+ Dir.chdir tmpdir do
83
+ query = Tester.testpath("test_contig_nc1.fa")
84
+ target = Tester.testpath("test_reference_nc1.fa")
85
+ crbblast = CRB_Blast::CRB_Blast.new query, target
86
+ crbblast.run(1e-5, 1, true)
87
+ assembly = Transrate::Assembly.new(query)
88
+ reference = Transrate::Assembly.new(target)
89
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
90
+ comp.get_reference_hits crbblast
91
+ comp.per_query_contig_reference_coverage
92
+ assert_equal (2/3.0), comp.assembly["contig1"].reference_coverage
93
+ end
94
+ end
95
+ end
96
+
97
+ should "01-1a get per contig reference coverage on protein" do
98
+ Dir.mktmpdir do |tmpdir|
99
+ Dir.chdir tmpdir do
100
+ query = Tester.testpath("test_contig_nc1.fa")
101
+ target = Tester.testpath("test_reference_aa1.fa")
102
+ crbblast = CRB_Blast::CRB_Blast.new query, target
103
+ crbblast.run(1e-5, 1, true)
104
+ assembly = Transrate::Assembly.new(query)
105
+ reference = Transrate::Assembly.new(target)
106
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
107
+ comp.get_reference_hits crbblast
108
+ comp.per_query_contig_reference_coverage
109
+ assert_equal (2/3.0), comp.assembly["contig1"].reference_coverage
110
+ end
111
+ end
112
+ end
113
+
114
+ should "01e raise error because you can't have protein queries" do
115
+ Dir.mktmpdir do |tmpdir|
116
+ Dir.chdir tmpdir do
117
+ query = Tester.testpath("test_reference_aa1.fa")
118
+ target = Tester.testpath("test_contig_nc1.fa")
119
+ crbblast = CRB_Blast::CRB_Blast.new query, target
120
+ crbblast.run(1e-5, 1, true)
121
+ assembly = Transrate::Assembly.new(query)
122
+ reference = Transrate::Assembly.new(target)
123
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
124
+ comp.get_reference_hits crbblast
125
+ assert_raise Transrate::TransrateError do
126
+ comp.per_query_contig_reference_coverage
127
+ end
128
+ end
129
+ end
130
+ end
131
+
132
+ should "02-2n calculate coverage for each reference sequence" do
133
+ Dir.mktmpdir do |tmpdir|
134
+ # tmpdir = Dir.mktmpdir
135
+ # puts tmpdir
136
+ Dir.chdir tmpdir do
137
+ query = Tester.testpath("test_contig_nc2.fa")
138
+ target = Tester.testpath("test_reference_nc1.fa")
139
+ crbblast = CRB_Blast::CRB_Blast.new query, target
140
+ crbblast.run(1e-5, 1, true)
141
+ assembly = Transrate::Assembly.new(query)
142
+ reference = Transrate::Assembly.new(target)
143
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
144
+ comp.get_reference_hits crbblast
145
+ comp.per_query_contig_reference_coverage
146
+ comp.per_target_contig_reference_coverage crbblast
147
+ # answer should be 290/300.0
148
+ assert_equal 29/30.0, comp.reference["reference1"].reference_coverage
149
+ end
150
+ end
151
+ end
125
152
 
126
- crb.change_hit("scaf_Os12g22750.1", "LOC_Os12g22750.1",
127
- 1, 300, 101, 200, 300, 200) # 0.5 # 300/600
128
- @reference["LOC_Os12g22750.1"].seq = "A"*200
153
+ should "02-3n calculate coverage for each reference sequence" do
154
+ Dir.mktmpdir do |tmpdir|
155
+ Dir.chdir tmpdir do
156
+ query = Tester.testpath("test_contig_nc3.fa")
157
+ target = Tester.testpath("test_reference_nc1.fa")
158
+ crbblast = CRB_Blast::CRB_Blast.new query, target
159
+ crbblast.run(1e-5, 1, true)
160
+ crbblast.add_missing
161
+ assembly = Transrate::Assembly.new(query)
162
+ reference = Transrate::Assembly.new(target)
163
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
164
+ comp.get_reference_hits crbblast
165
+ comp.per_query_contig_reference_coverage
166
+ comp.per_target_contig_reference_coverage crbblast
167
+ # answer should be 1.0000
168
+ assert_equal 1.00, comp.reference["reference1"].reference_coverage
169
+ end
170
+ end
171
+ end
129
172
 
130
- crb.change_hit("scaf_Os02g55190.1", "LOC_Os02g55190.1",
131
- 1, 300, 101, 200, 300, 200) # 0.5 # 300/600
132
- @reference["LOC_Os02g55190.1"].seq = "A"*200
173
+ should "02-3a calculate coverage for each reference sequence" do
174
+ Dir.mktmpdir do |tmpdir|
175
+ Dir.chdir tmpdir do
176
+ query = Tester.testpath("test_contig_nc3.fa")
177
+ target = Tester.testpath("test_reference_aa1.fa")
178
+ crbblast = CRB_Blast::CRB_Blast.new query, target
179
+ crbblast.run(1e-5, 1, true)
180
+ crbblast.add_missing
181
+ assembly = Transrate::Assembly.new(query)
182
+ reference = Transrate::Assembly.new(target)
183
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
184
+ comp.get_reference_hits crbblast
185
+ comp.per_query_contig_reference_coverage
186
+ comp.per_target_contig_reference_coverage crbblast
187
+ # answer should be 1.0000
188
+ assert_equal 1.00, comp.reference["reference2"].reference_coverage
189
+ end
190
+ end
191
+ end
133
192
 
134
- crb.change_hit("scaf_Os03g56500.1", "LOC_Os03g56500.1",
135
- 1, 300, 101, 200, 300, 400) # 0.25
136
- crb.change_hit("scaf_Os03g56500.2", "LOC_Os03g56500.1",
137
- 1, 300, 201, 300, 300, 400) # 0.25 # 600 / 1200
138
- @reference["LOC_Os03g56500.1"].seq = "A"*400
193
+ should "02-4n calculate coverage for each reference sequence" do
194
+ Dir.mktmpdir do |tmpdir|
195
+ Dir.chdir tmpdir do
196
+ query = Tester.testpath("test_contig_nc4.fa")
197
+ target = Tester.testpath("test_reference_nc1.fa")
198
+ crbblast = CRB_Blast::CRB_Blast.new query, target
199
+ crbblast.run(1e-5, 1, true)
200
+ assembly = Transrate::Assembly.new(query)
201
+ reference = Transrate::Assembly.new(target)
202
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
203
+ comp.get_reference_hits crbblast
204
+ comp.per_query_contig_reference_coverage
205
+ comp.per_target_contig_reference_coverage crbblast
206
+ assert_equal 0.5, comp.reference["reference1"].reference_coverage
207
+ end
208
+ end
209
+ end
139
210
 
140
- crb.change_hit("scaf_Os03g56724.1", "LOC_Os03g56724.1",
141
- 1, 300, 101, 200, 300, 200) # 300/600 = 0.5
142
- @reference["LOC_Os03g56724.1"].seq = "A"*200
211
+ should "02-4a calculate coverage for each reference sequence" do
212
+ Dir.mktmpdir do |tmpdir|
213
+ Dir.chdir tmpdir do
214
+ query = Tester.testpath("test_contig_nc4.fa")
215
+ target = Tester.testpath("test_reference_aa1.fa")
216
+ crbblast = CRB_Blast::CRB_Blast.new query, target
217
+ crbblast.run(1e-5, 1, true)
218
+ assembly = Transrate::Assembly.new(query)
219
+ reference = Transrate::Assembly.new(target)
220
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
221
+ comp.get_reference_hits crbblast
222
+ comp.per_query_contig_reference_coverage
223
+ comp.per_target_contig_reference_coverage crbblast
224
+ assert_equal 0.5, comp.reference["reference2"].reference_coverage
225
+ end
226
+ end
227
+ end
143
228
 
144
- crb.remove_hit("scaf_Os01g11360.1")
229
+ should "02-5a calculate coverage for each reference sequence" do
230
+ Dir.mktmpdir do |tmpdir|
231
+ Dir.chdir tmpdir do
232
+ query = Tester.testpath("test_contig_nc5.fa")
233
+ target = Tester.testpath("test_reference_aa1.fa")
234
+ crbblast = CRB_Blast::CRB_Blast.new query, target
235
+ crbblast.run(1e-5, 1, true)
236
+ crbblast.add_missing
237
+ assembly = Transrate::Assembly.new(query)
238
+ reference = Transrate::Assembly.new(target)
239
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
240
+ comp.get_reference_hits crbblast
241
+ comp.per_query_contig_reference_coverage
242
+ comp.per_target_contig_reference_coverage crbblast
243
+ assert_equal (2/3.0), comp.reference["reference2"].reference_coverage
244
+ end
245
+ end
246
+ end
145
247
 
146
- @reference["LOC_Os03g08270.3"].seq = "A"*200
147
- @reference["LOC_Os10g41970.1"].seq = "A"*200
148
- @reference["LOC_Os09g26780.1"].seq = "A"*200
149
- @reference["LOC_Os12g24659.1"].seq = "A"*200
150
- @reference["LOC_Os01g36410.1"].seq = "A"*200
151
- @reference["LOC_Os12g22780.1"].seq = "A"*200
152
- @reference["LOC_Os02g56470.1"].seq = "A"*200
153
- @reference["LOC_Os03g30530.1"].seq = "A"*200
154
- @reference["LOC_Os03g49850.1"].seq = "A"*200
155
- @reference["LOC_Os01g11360.1"].seq = "A"*200
156
- @reference["LOC_Os01g44140.1"].seq = "A"*200
248
+ should "02-5n calculate coverage for each reference sequence" do
249
+ Dir.mktmpdir do |tmpdir|
250
+ Dir.chdir tmpdir do
251
+ query = Tester.testpath("test_contig_nc5.fa")
252
+ target = Tester.testpath("test_reference_nc1.fa")
253
+ crbblast = CRB_Blast::CRB_Blast.new query, target
254
+ crbblast.run(1e-5, 1, true)
255
+ crbblast.add_missing
256
+ assembly = Transrate::Assembly.new(query)
257
+ reference = Transrate::Assembly.new(target)
258
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
259
+ comp.get_reference_hits crbblast
260
+ comp.per_query_contig_reference_coverage
261
+ comp.per_target_contig_reference_coverage crbblast
262
+ assert_equal (2/3.0), comp.reference["reference1"].reference_coverage
263
+ end
264
+ end
265
+ end
157
266
 
158
- assert_equal true, crb.target_is_prot, "target is prot"
159
- assert_equal false, crb.query_is_prot, "query is prot"
160
- # total_length of references should be 4400
267
+ should "02-6a calculate coverage for each reference sequence" do
268
+ Dir.mktmpdir do |tmpdir|
269
+ Dir.chdir tmpdir do
270
+ query = Tester.testpath("test_contig_nc6.fa")
271
+ target = Tester.testpath("test_reference_aa1.fa")
272
+ crbblast = CRB_Blast::CRB_Blast.new query, target
273
+ crbblast.run(1e-5, 1, true)
274
+ crbblast.add_missing
275
+ assembly = Transrate::Assembly.new(query)
276
+ reference = Transrate::Assembly.new(target)
277
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
278
+ comp.get_reference_hits crbblast
279
+ comp.per_query_contig_reference_coverage
280
+ comp.per_target_contig_reference_coverage crbblast
281
+ assert_equal (1/3.0), comp.reference["reference2"].reference_coverage
282
+ end
283
+ end
284
+ end
161
285
 
162
- cov = @comp.coverage crb
163
- assert_equal 3600/13200.0, cov, "reference coverage"
286
+ should "02-6n calculate coverage for each reference sequence" do
287
+ Dir.mktmpdir do |tmpdir|
288
+ Dir.chdir tmpdir do
289
+ query = Tester.testpath("test_contig_nc6.fa")
290
+ target = Tester.testpath("test_reference_nc1.fa")
291
+ crbblast = CRB_Blast::CRB_Blast.new query, target
292
+ crbblast.run(1e-5, 1, true)
293
+ crbblast.add_missing
294
+ assembly = Transrate::Assembly.new(query)
295
+ reference = Transrate::Assembly.new(target)
296
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
297
+ comp.get_reference_hits crbblast
298
+ comp.per_query_contig_reference_coverage
299
+ comp.per_target_contig_reference_coverage crbblast
300
+ assert_equal (1/3.0), comp.reference["reference1"].reference_coverage
301
+ end
302
+ end
164
303
  end
165
304
 
166
- should "calculate overlap amount" do
167
- assert_equal 0.5, @comp.overlap_amount(201,500,101,400), "1"
168
- assert_equal 0.5, @comp.overlap_amount(101,400,201,500), "2"
169
- assert_equal 0.5, @comp.overlap_amount(201,400,101,500), "3"
170
- assert_equal 0.5, @comp.overlap_amount(101,500,201,400), "4"
171
- end
172
-
173
- should "calculate number of contigs with crbblast hit" do
305
+ should "02-7a calculate coverage for each reference sequence" do
174
306
  Dir.mktmpdir do |tmpdir|
175
307
  Dir.chdir tmpdir do
176
- @comp.run
177
- assert_equal 11, @comp.comp_stats[:n_contigs_with_CRBB]
178
- assert_equal 11/13.0, @comp.comp_stats[:p_contigs_with_CRBB]
308
+ query = Tester.testpath("test_contig_nc7.fa")
309
+ target = Tester.testpath("test_reference_aa1.fa")
310
+ crbblast = CRB_Blast::CRB_Blast.new query, target
311
+ crbblast.run(1e-5, 1, true)
312
+ crbblast.add_missing
313
+ assembly = Transrate::Assembly.new(query)
314
+ reference = Transrate::Assembly.new(target)
315
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
316
+ comp.get_reference_hits crbblast
317
+ comp.per_query_contig_reference_coverage
318
+ comp.per_target_contig_reference_coverage crbblast
319
+ assert_equal (1/3.0), comp.reference["reference2"].reference_coverage
179
320
  end
180
321
  end
181
322
  end
182
323
 
183
- should "calculate number of reference sequences with crbblast hit" do
324
+ should "02-7n calculate coverage for each reference sequence" do
184
325
  Dir.mktmpdir do |tmpdir|
185
326
  Dir.chdir tmpdir do
186
- @comp.run
187
- assert_equal 10, @comp.comp_stats[:n_refs_with_CRBB]
188
- assert_equal 0.5, @comp.comp_stats[:p_refs_with_CRBB]
327
+ query = Tester.testpath("test_contig_nc7.fa")
328
+ target = Tester.testpath("test_reference_nc1.fa")
329
+ crbblast = CRB_Blast::CRB_Blast.new query, target
330
+ crbblast.run(1e-5, 1, true)
331
+ crbblast.add_missing
332
+ assembly = Transrate::Assembly.new(query)
333
+ reference = Transrate::Assembly.new(target)
334
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
335
+ comp.get_reference_hits crbblast
336
+ comp.per_query_contig_reference_coverage
337
+ comp.per_target_contig_reference_coverage crbblast
338
+ assert_equal (1/3.0), comp.reference["reference1"].reference_coverage
189
339
  end
190
340
  end
191
341
  end
192
342
 
193
- should "calculate reference sequence coverage" do
194
- # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
195
- # of their length by CRB-BLAST hit
343
+ should "03 calculate all metrics" do
196
344
  Dir.mktmpdir do |tmpdir|
197
345
  Dir.chdir tmpdir do
198
- @comp.run
199
- stats = @comp.comp_stats
200
- assert_equal 10, stats[:cov25]
201
- assert_equal 10, stats[:cov50]
202
- assert_equal 7, stats[:cov75]
203
- assert_equal 6, stats[:cov85]
204
- assert_equal 3, stats[:cov95]
346
+ query = Tester.testpath("assembly.2.fa")
347
+ target = Tester.testpath("Os.protein.2.fa")
348
+ crbblast = CRB_Blast::CRB_Blast.new query, target
349
+ crbblast.run(1e-5, 1, true)
350
+ assembly = Transrate::Assembly.new(query)
351
+ reference = Transrate::Assembly.new(target)
352
+ comp = Transrate::ComparativeMetrics.new(assembly, reference, 1)
353
+ comp.get_reference_hits crbblast
354
+ comp.per_query_contig_reference_coverage
355
+ comp.per_target_contig_reference_coverage crbblast
356
+ assert_equal 11, comp.comp_stats[:CRBB_hits], "CRBB hits"
357
+ assert_equal 11, comp.comp_stats[:n_contigs_with_CRBB], "n_contigs_with_CRBB"
358
+ assert_equal 0.84615, comp.comp_stats[:p_contigs_with_CRBB].round(5), "p_contigs_with_CRBB"
359
+ assert_equal 0.55, comp.comp_stats[:rbh_per_reference], "rbh_per_reference"
360
+ assert_equal 10, comp.comp_stats[:n_refs_with_CRBB], "n_refs_with_CRBB"
361
+ assert_equal 0.5, comp.comp_stats[:p_refs_with_CRBB], "p_refs_with_CRBB"
362
+ assert_equal 10, comp.comp_stats[:cov25], "cov25"
363
+ assert_equal 10, comp.comp_stats[:cov50], "cov50"
364
+ assert_equal 7, comp.comp_stats[:cov75], "cov75"
365
+ assert_equal 6, comp.comp_stats[:cov85], "cov85"
366
+ assert_equal 3, comp.comp_stats[:cov95], "cov95"
367
+ assert_equal 0.5, comp.comp_stats[:p_cov25], "p_cov25"
368
+ assert_equal 0.5, comp.comp_stats[:p_cov50], "p_cov50"
369
+ assert_equal 0.35, comp.comp_stats[:p_cov75], "p_cov75"
370
+ assert_equal 0.3, comp.comp_stats[:p_cov85], "p_cov85"
371
+ assert_equal 0.15, comp.comp_stats[:p_cov95], "p_cov95"
372
+ assert_equal 0.37261, comp.comp_stats[:reference_coverage].round(5), "reference_coverage"
373
+
205
374
  end
206
375
  end
207
376
  end
208
377
 
378
+
379
+
209
380
  end
210
381
  end