transrate 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +16 -1
  3. data/.travis.yml +8 -0
  4. data/README.md +45 -43
  5. data/Rakefile +36 -0
  6. data/bin/transrate +98 -50
  7. data/deps/deps.yaml +55 -0
  8. data/lib/transrate.rb +19 -4
  9. data/lib/transrate/assembly.rb +93 -182
  10. data/lib/transrate/bowtie2.rb +37 -13
  11. data/lib/transrate/cmd.rb +19 -0
  12. data/lib/transrate/comparative_metrics.rb +239 -19
  13. data/lib/transrate/contig.rb +212 -0
  14. data/lib/transrate/contig_metrics.rb +76 -0
  15. data/lib/transrate/read_metrics.rb +83 -41
  16. data/lib/transrate/samtools.rb +73 -0
  17. data/lib/transrate/transrater.rb +31 -11
  18. data/lib/transrate/version.rb +1 -1
  19. data/test/data/150uncovered.l.fq +892 -0
  20. data/test/data/150uncovered.r.fq +892 -0
  21. data/test/data/Os.protein.2.fa +95 -0
  22. data/test/data/Os.protein.fa +199 -0
  23. data/test/data/assembly.2.fa +26 -0
  24. data/test/{assembly.fasta → data/assembly.fasta} +0 -0
  25. data/test/data/bridging_reads.l.fastq +20 -0
  26. data/test/data/bridging_reads.r.fastq +20 -0
  27. data/test/data/sorghum_transcript.fa +4 -0
  28. data/test/data/tiny.sam +4 -0
  29. data/test/helper.rb +33 -2
  30. data/test/test_bowtie.rb +54 -0
  31. data/test/test_cmd.rb +15 -0
  32. data/test/test_comp_metrics.rb +177 -0
  33. data/test/test_contig.rb +61 -0
  34. data/test/test_contig_metrics.rb +50 -0
  35. data/test/test_inline.rb +10 -9
  36. data/test/test_read_metrics.rb +68 -0
  37. data/test/test_samtools.rb +22 -0
  38. data/test/test_transrate.rb +40 -0
  39. data/test/test_transrater.rb +68 -0
  40. data/transrate.gemspec +16 -10
  41. metadata +232 -57
  42. data/lib/transrate/express.rb +0 -37
  43. data/lib/transrate/log.rb +0 -16
  44. data/lib/transrate/rb_hit.rb +0 -33
  45. data/lib/transrate/reciprocal_annotation.rb +0 -105
  46. data/lib/transrate/usearch.rb +0 -66
  47. data/test/test_test.rb +0 -41
data/test/test_cmd.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'helper'
2
+
3
+ class TestCmd < Test::Unit::TestCase
4
+ # Tests for Transrate::Cmd, exercised here with a single shell command.
5
+ context "Cmd" do
6
+ # After #run, cmd.stdout with its trailing newline chomped should equal "success".
7
+ should "run commands" do
8
+ cmd = Transrate::Cmd.new 'echo "success"'
9
+ cmd.run
10
+ assert_equal "success", cmd.stdout.chomp, 'run echo command'
11
+ end
12
+
13
+ end
14
+
15
+ end
@@ -0,0 +1,177 @@
1
+ require 'helper'
2
+
3
+ class TestCompMetrics < Test::Unit::TestCase
4
+
5
+ context "ComparativeMetrics" do
6
+
7
+ setup do
8
+ querypath = File.join(File.dirname(__FILE__),
9
+ 'data',
10
+ 'assembly.2.fa')
11
+ targetpath = File.join(File.dirname(__FILE__),
12
+ 'data',
13
+ 'Os.protein.2.fa')
14
+ assembly = Transrate::Assembly.new(querypath)
15
+ reference = Transrate::Assembly.new(targetpath)
16
+ threads = 8
17
+ @comp = Transrate::ComparativeMetrics.new(assembly, reference, threads)
18
+ end
19
+
20
+
21
+ should "run metrics on assembly" do
22
+ Dir.mktmpdir do |tmpdir|
23
+ Dir.chdir tmpdir do
24
+ @comp.run
25
+ assert @comp.has_run
26
+ end
27
+ end
28
+ end
29
+
30
+ should "calculate ortholog hit ratio" do
31
+ crb = CRBHelper.new(false)
32
+
33
+ hash = Hash.new
34
+ (1..11).each do |i|
35
+ hash["q#{i}"] = []
36
+ end
37
+ # Q1 |------------|
38
+ # Q2 |--------------|
39
+ # T1 |------------------------------------| # coverage = 200/500
40
+ hash["q1"] << HitHelper.new("q1", "t1", 1, 100, 101, 200, 100, 500)
41
+ hash["q2"] << HitHelper.new("q2", "t1", 1, 100, 301, 400, 100, 500)
42
+
43
+ # Q3 |------------|
44
+ # Q4 |--------------|
45
+ # T2 |------------------------------------| # coverage = 200/500
46
+ hash["q3"] << HitHelper.new("q3", "t2", 1, 100, 301, 400, 100, 500)
47
+ hash["q4"] << HitHelper.new("q4", "t2", 1, 100, 101, 200, 100, 500)
48
+
49
+ # Q5 |------------|
50
+ # Q6 |-------------------|
51
+ # T3 |------------------------------------| # coverage = 300/500
52
+ hash["q5"] << HitHelper.new("q5", "t3", 1, 200, 201, 400, 200, 500)
53
+ hash["q6"] << HitHelper.new("q6", "t3", 1, 200, 101, 300, 200, 500)
54
+
55
+ # Q7 |------------|
56
+ # Q8 |------------------------|
57
+ # T3 |------------------------------------| # coverage = 300/500
58
+ hash["q7"] << HitHelper.new("q7", "t4", 1, 100, 201, 300, 100, 500)
59
+ hash["q8"] << HitHelper.new("q8", "t4", 1, 300, 101, 400, 300, 500)
60
+
61
+ # Q9 |--------|
62
+ # Q10 |--------|
63
+ # Q11 |--------------------|
64
+ # T5 |------------------------------------| # coverage = 600/1000
65
+ hash["q9"] << HitHelper.new("q9", "t5", 1, 200, 201, 400, 200, 1000)
66
+ hash["q10"] << HitHelper.new("q10", "t5", 1, 200, 601, 800, 200, 1000)
67
+ hash["q11"] << HitHelper.new("q11", "t5", 1, 400, 301, 700, 400, 1000)
68
+
69
+ crb.hash = hash
70
+ ohr = @comp.ortholog_hit_ratio crb
71
+ assert_equal 16.0/30.0, ohr
72
+ end
73
+
74
+ should "calculate potential chimera count" do
75
+ crb = CRBHelper.new(false)
76
+
77
+ hash = Hash.new
78
+ (1..3).each do |i|
79
+ hash["q#{i}"] = []
80
+ end
81
+
82
+ # T1 |---------|
83
+ # T2 |---------|
84
+ # Q1 |----------------------------| # chimera = true
85
+ hash["q1"] << HitHelper.new("q1", "t1", 101, 200, 1, 100, 500, 100)
86
+ hash["q1"] << HitHelper.new("q1", "t2", 301, 400, 1, 100, 400, 100)
87
+
88
+
89
+ # T3 |---------|
90
+ # T3 |---------|
91
+ # Q2 |----------------------------|
92
+ # chimera = true because the reference has the region 1-100 duplicated
93
+ hash["q2"] << HitHelper.new("q2", "t3", 101, 200, 1, 100, 500, 100)
94
+ hash["q2"] << HitHelper.new("q2", "t3", 301, 400, 1, 100, 400, 100)
95
+
96
+ # # T4 |---------|
97
+ # # T4 |---------|
98
+ # # Q3 |----------------------------|
99
+ # # chimera = false because the reference
100
+ hash["q3"] << HitHelper.new("q3", "t4", 101, 200, 1, 100, 500, 200)
101
+ hash["q3"] << HitHelper.new("q3", "t4", 301, 400, 101, 200, 400, 200)
102
+
103
+ crb.hash = hash
104
+ @comp.chimeras crb
105
+ assert_equal 0.667, @comp.p_chimeras.round(3)
106
+ end
107
+
108
+ should "calculate overlap amount" do
109
+ assert_equal 0.5, @comp.overlap_amount(201,500,101,400), "1"
110
+ assert_equal 0.5, @comp.overlap_amount(101,400,201,500), "2"
111
+ assert_equal 0.5, @comp.overlap_amount(201,400,101,500), "3"
112
+ assert_equal 0.5, @comp.overlap_amount(101,500,201,400), "4"
113
+ end
114
+
115
+ should "calculate number of contigs with crbblast hit" do
116
+ Dir.mktmpdir do |tmpdir|
117
+ Dir.chdir tmpdir do
118
+ @comp.run
119
+ assert_equal 11, @comp.comp_stats[:n_contigs_with_recip]
120
+ assert_equal 11/13.0, @comp.comp_stats[:p_contigs_with_recip]
121
+ end
122
+ end
123
+ end
124
+
125
+ should "calculate number of reference sequences with crbblast hit" do
126
+ Dir.mktmpdir do |tmpdir|
127
+ Dir.chdir tmpdir do
128
+ @comp.run
129
+ assert_equal 10, @comp.comp_stats[:n_refs_with_recip]
130
+ assert_equal 0.5, @comp.comp_stats[:p_refs_with_recip]
131
+ end
132
+ end
133
+ end
134
+
135
+ should "calculate reference sequence coverage" do
136
+ # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
137
+ # of their length by CRB-BLAST hit
138
+ Dir.mktmpdir do |tmpdir|
139
+ Dir.chdir tmpdir do
140
+ @comp.run
141
+ stats = @comp.comp_stats
142
+ assert_equal 10, stats[:cov25]
143
+ assert_equal 10, stats[:cov50]
144
+ assert_equal 7, stats[:cov75]
145
+ assert_equal 6, stats[:cov85]
146
+ assert_equal 3, stats[:cov95]
147
+ end
148
+ end
149
+ end
150
+
151
+ should "number of reference sequences coverage" do
152
+ # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
153
+ # of their length by CRB-BLAST hit
154
+ crb = CRBHelper.new(false)
155
+
156
+ hash = Hash.new
157
+ (1..5).each do |i|
158
+ hash["q#{i}"] = []
159
+ end
160
+ hash["q1"] << HitHelper.new("q1", "t1", 1, 250, 101, 350, 250, 1000)
161
+ hash["q2"] << HitHelper.new("q2", "t2", 1, 500, 101, 600, 500, 1000)
162
+ hash["q3"] << HitHelper.new("q3", "t3", 1, 750, 101, 850, 750, 1000)
163
+ hash["q4"] << HitHelper.new("q4", "t4", 1, 850, 101, 950, 850, 1000)
164
+ hash["q5"] << HitHelper.new("q5", "t5", 1, 950, 1, 950, 950, 1000)
165
+
166
+ crb.hash = hash
167
+ ohr = @comp.ortholog_hit_ratio crb
168
+ stats = @comp.comp_stats
169
+ assert_equal 5, stats[:cov25]
170
+ assert_equal 4, stats[:cov50]
171
+ assert_equal 3, stats[:cov75]
172
+ assert_equal 2, stats[:cov85]
173
+ assert_equal 1, stats[:cov95]
174
+ end
175
+
176
+ end
177
+ end
@@ -0,0 +1,61 @@
1
+ require 'helper'
2
+ require 'bio'
3
+
4
+ class TestContig < Test::Unit::TestCase
5
+ # Tests for the per-contig sequence statistics exposed by Transrate::Contig.
6
+ context "Contig" do
7
+ # The fixture is a 20-base sequence containing 5 A, 3 C, 6 G, 6 T and no N.
8
+ setup do
9
+ seq = Bio::Sequence.new 'ATGCGTGTATATACGCGTAG'
10
+ @contig = Transrate::Contig.new seq
11
+ end
12
+
13
+ should "know the number and proportion of each base it contains" do
14
+ assert_equal 5, @contig.bases_a, "count of base a"
15
+ assert_equal 0.25, @contig.prop_a, "proportion of base a"
16
+ assert_equal 3, @contig.bases_c, "count of base c"
17
+ assert_equal 0.15, @contig.prop_c, "proportion of base c"
18
+ assert_equal 6, @contig.bases_g, "count of base g"
19
+ assert_equal 0.3, @contig.prop_g, "proportion of base g"
20
+ assert_equal 6, @contig.bases_t, "count of base t"
21
+ assert_equal 0.3, @contig.prop_t, "proportion of base t"
22
+ assert_equal 0, @contig.bases_n, "count of base n"
23
+ assert_equal 0.0, @contig.prop_n, "proportion of base n"
24
+ end
25
+
26
+ should "know how many of each two-base pair it contains" do
27
+ assert_equal 3, @contig.dibase_composition[:cg], "cg count"
28
+ assert_equal 3, @contig.dibase_composition[:at], "at count"
29
+ assert_equal 2, @contig.dibase_composition[:tg], "tg count"
30
+ end
31
+
32
+ should "know its own gc content" do
33
+ assert_equal 9, @contig.bases_gc, "count of bases that are c or g"
34
+ assert_equal 0.45, @contig.prop_gc.round(2),
35
+ "proportion of bases that are c or g"
36
+ end
37
+ # NOTE(review): 0.45 and 0.55 equal the fixture's GC (9/20) and AT (11/20) proportions - confirm these are the intended skew values.
38
+ should "know its own base-pair skew" do
39
+ assert_equal 0.45, @contig.gc_skew.round(2), "gc skew"
40
+ assert_equal 0.55, @contig.at_skew.round(2), "at skew"
41
+ end
42
+
43
+ should "know its own CpG count and density" do
44
+ assert_equal 3, @contig.cpg_count, "cpg count"
45
+ assert_equal 66.67, @contig.cpg_ratio.round(2), "cpg ratio"
46
+ end
47
+
48
+ should "know the length of its own longest orf" do
49
+ assert_equal 6, @contig.orf_length, "orf length"
50
+ end
51
+
52
+ # linguistic_complexity is called with k = 4 and k = 6; results are compared after rounding to 4 places.
53
+ should "know its own linguistic complexity" do
54
+ assert_equal 0.0586, @contig.linguistic_complexity(4).round(4),
55
+ "linguistic complexity k=4"
56
+ assert_equal 0.0037, @contig.linguistic_complexity(6).round(4),
57
+ "linguistic complexity k=6"
58
+ end
59
+
60
+ end
61
+ end
@@ -0,0 +1,50 @@
1
+ require 'helper'
2
+
3
+ class TestContigMetrics < Test::Unit::TestCase
4
+
5
+ context "transrate" do
6
+
7
+ setup do
8
+ querypath = File.join(File.dirname(__FILE__), 'data',
9
+ 'assembly.fasta')
10
+ assembly = Transrate::Assembly.new(querypath)
11
+ @contig_metrics = Transrate::ContigMetrics.new(assembly)
12
+ end
13
+
14
+ should "run metrics on assembly" do
15
+ @contig_metrics.run
16
+ assert @contig_metrics.has_run
17
+ end
18
+
19
+ should "get gc content" do
20
+ @contig_metrics.run
21
+ assert_equal 0.37672, @contig_metrics.gc_prop.round(5)
22
+ end
23
+
24
+ should "get gc skew" do
25
+ @contig_metrics.run
26
+ assert_equal 0.00440, @contig_metrics.gc_skew.round(5)
27
+ end
28
+
29
+ should "get at skew" do
30
+ @contig_metrics.run
31
+ assert_equal -0.00718, @contig_metrics.at_skew.round(5)
32
+ end
33
+
34
+ should "get CpG density" do
35
+ @contig_metrics.run
36
+ assert_equal 0.52828, @contig_metrics.cpg_ratio.round(5)
37
+ end
38
+
39
+ should "get linguistic complexity" do
40
+ @contig_metrics.run
41
+ assert_equal 0.26599, @contig_metrics.linguistic_complexity.round(5)
42
+ end
43
+
44
+ should "get the number and proportion of Ns" do
45
+ @contig_metrics.run
46
+ assert_equal 2, @contig_metrics.bases_n
47
+ assert_equal 0.00033, @contig_metrics.proportion_n.round(5)
48
+ end
49
+ end
50
+ end
data/test/test_inline.rb CHANGED
@@ -1,29 +1,30 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'helper'
4
+ require 'bio'
4
5
 
5
6
  class TestInline < Test::Unit::TestCase
6
7
 
7
8
  context 'transrate' do
8
9
 
9
10
  setup do
10
- @a = Transrate::Assembly.new('test/assembly.fasta')
11
- @seq1 = 'ATGCCCCTAGGGTAG'
11
+ filepath = File.join(File.dirname(__FILE__), 'data', 'assembly.fasta')
12
+ @a = Transrate::Assembly.new(filepath)
12
13
  end
13
14
 
14
15
  should 'find longest orf in file' do
15
16
  orfs = []
16
- @a.assembly.each do |entry|
17
- l = @a.orf_length entry.seq
18
- orfs << l
17
+ @a.assembly.each do |contig|
18
+ orfs << contig.orf_length
19
19
  end
20
- assert_equal orfs.length, 4
21
- assert_equal orfs, [333, 370, 131, 84]
20
+ assert_equal 4, orfs.length
21
+ assert_equal [333, 370, 131, 84], orfs
22
22
  end
23
23
 
24
24
  should 'find longest orf in sequence' do
25
- l = @a.orf_length(@seq1)
26
- assert_equal l, 4
25
+ seq = Bio::Sequence.new 'ATGCCCCTAGGGTAG'
26
+ contig = Transrate::Contig.new seq
27
+ assert_equal 4, contig.orf_length
27
28
  end
28
29
 
29
30
  end
@@ -0,0 +1,68 @@
1
+ require 'helper'
2
+ require 'tmpdir'
3
+
4
+ class TestReadMetrics < Test::Unit::TestCase
5
+
6
+ context "ReadMetrics" do
7
+
8
+ setup do
9
+ query = File.join(File.dirname(__FILE__), 'data',
10
+ 'sorghum_transcript.fa')
11
+ assembly = Transrate::Assembly.new(query)
12
+ @read_metrics = Transrate::ReadMetrics.new(assembly)
13
+ end
14
+
15
+ teardown do
16
+ if File.exist?("test/data/sorghum_transcript.fa.fai")
17
+ rm = "rm test/data/sorghum_transcript.fa.fai"
18
+ `#{rm}`
19
+ end
20
+ end
21
+
22
+ should "setup correctly" do
23
+ assert @read_metrics
24
+ end
25
+
26
+ should "calculate read mapping statistics" do
27
+ left = File.join(File.dirname(__FILE__), 'data', '150uncovered.l.fq')
28
+ right = File.join(File.dirname(__FILE__), 'data', '150uncovered.r.fq')
29
+ Dir.mktmpdir do |tmpdir|
30
+ Dir.chdir tmpdir do
31
+ @read_metrics.run(left, right)
32
+ stats = @read_metrics.read_stats
33
+ assert @read_metrics.has_run
34
+ assert_equal 223, stats[:num_pairs], 'number of read pairs'
35
+ assert_equal 202, stats[:total_mappings], 'number mapping'
36
+ assert_equal 90.58, stats[:percent_mapping].round(2),
37
+ 'percent mapping'
38
+ assert_equal 202, stats[:good_mappings], 'good mapping'
39
+ assert_equal 90.58,
40
+ stats[:pc_good_mapping].round(2),
41
+ 'percent good mapping'
42
+ assert_equal 0, stats[:bad_mappings], 'bad mapping'
43
+ assert_equal 22.91, stats[:mean_coverage].round(2), 'mean coverage'
44
+ assert_equal 11, stats[:n_uncovered_bases], 'n uncovered bases'
45
+ assert_equal 0.007,
46
+ stats[:p_uncovered_bases].round(3),
47
+ 'p uncovered bases'
48
+ end
49
+ end
50
+ end
51
+
52
+ should "find read pairs that support scaffolding" do
53
+ left = File.join(File.dirname(__FILE__), 'data', 'bridging_reads.l.fastq')
54
+ right = File.join(File.dirname(__FILE__),
55
+ 'data',
56
+ 'bridging_reads.r.fastq')
57
+ Dir.mktmpdir do |tmpdir|
58
+ Dir.chdir tmpdir do
59
+ @read_metrics.run(left, right)
60
+ stats = @read_metrics.read_stats
61
+ assert_equal 1, stats[:potential_bridges], 'potential bridges'
62
+ end
63
+ end
64
+ end
65
+
66
+ end
67
+
68
+ end
@@ -0,0 +1,22 @@
1
+ require 'helper'
2
+
3
+ class TestSamtools < Test::Unit::TestCase
4
+ # Tests for the Transrate::Samtools wrapper (binary path lookup and command execution).
5
+ context "samtools" do
6
+ # Runs the binary at Samtools.path with no arguments and looks for "Program: samtools" in its combined output.
7
+ should "know the path to samtools binary" do
8
+ msg = /Program: samtools/
9
+ path = Transrate::Samtools.path
10
+ res = `#{path} 2>&1`.split("\n").join
11
+ assert msg =~ res
12
+ end
13
+ # NOTE(review): tiny.bam is written to the current directory and is only deleted if the size assertion passes.
14
+ should "run commands" do
15
+ sam = File.join(File.dirname(__FILE__), 'data', 'tiny.sam')
16
+ Transrate::Samtools.run "view -bS #{sam} > tiny.bam"
17
+ assert_equal 460, File.size('tiny.bam'), 'bam file should be created'
18
+ File.delete 'tiny.bam'
19
+ end
20
+
21
+ end
22
+ end
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'helper'
4
+
5
+ class TestTransrate < Test::Unit::TestCase
6
+
7
+ context "transrate" do
8
+
9
+ setup do
10
+ filepath = File.join(File.dirname(__FILE__), 'data', 'assembly.fasta')
11
+ @a = Transrate::Assembly.new(filepath)
12
+ end
13
+
14
+ should "create assembly object" do
15
+ assert @a
16
+ assert_equal @a.assembly.size, 4
17
+ end
18
+
19
+ should "run basic stats" do
20
+ stats = @a.basic_stats
21
+ assert_equal stats["n_seqs"], 4
22
+ assert_equal stats["smallest"], 1409
23
+ assert_equal stats["largest"], 1630
24
+ assert_equal stats["mean_len"], 1508.25
25
+ end
26
+
27
+ should "run metrics on assembly" do
28
+ ans = @a.run(2) # using 2 threads
29
+ assert_equal ans, true, "should run but returned #{ans}"
30
+ end
31
+
32
+ should "find the mean length" do
33
+ ans = @a.run(2)
34
+ mean = @a.mean_len
35
+ n_bases = @a.n_bases
36
+ assert_equal mean, 1508.25
37
+ assert_equal n_bases, 6033
38
+ end
39
+ end
40
+ end