transrate 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.gitignore +16 -1
  3. data/.travis.yml +8 -0
  4. data/README.md +45 -43
  5. data/Rakefile +36 -0
  6. data/bin/transrate +98 -50
  7. data/deps/deps.yaml +55 -0
  8. data/lib/transrate.rb +19 -4
  9. data/lib/transrate/assembly.rb +93 -182
  10. data/lib/transrate/bowtie2.rb +37 -13
  11. data/lib/transrate/cmd.rb +19 -0
  12. data/lib/transrate/comparative_metrics.rb +239 -19
  13. data/lib/transrate/contig.rb +212 -0
  14. data/lib/transrate/contig_metrics.rb +76 -0
  15. data/lib/transrate/read_metrics.rb +83 -41
  16. data/lib/transrate/samtools.rb +73 -0
  17. data/lib/transrate/transrater.rb +31 -11
  18. data/lib/transrate/version.rb +1 -1
  19. data/test/data/150uncovered.l.fq +892 -0
  20. data/test/data/150uncovered.r.fq +892 -0
  21. data/test/data/Os.protein.2.fa +95 -0
  22. data/test/data/Os.protein.fa +199 -0
  23. data/test/data/assembly.2.fa +26 -0
  24. data/test/{assembly.fasta → data/assembly.fasta} +0 -0
  25. data/test/data/bridging_reads.l.fastq +20 -0
  26. data/test/data/bridging_reads.r.fastq +20 -0
  27. data/test/data/sorghum_transcript.fa +4 -0
  28. data/test/data/tiny.sam +4 -0
  29. data/test/helper.rb +33 -2
  30. data/test/test_bowtie.rb +54 -0
  31. data/test/test_cmd.rb +15 -0
  32. data/test/test_comp_metrics.rb +177 -0
  33. data/test/test_contig.rb +61 -0
  34. data/test/test_contig_metrics.rb +50 -0
  35. data/test/test_inline.rb +10 -9
  36. data/test/test_read_metrics.rb +68 -0
  37. data/test/test_samtools.rb +22 -0
  38. data/test/test_transrate.rb +40 -0
  39. data/test/test_transrater.rb +68 -0
  40. data/transrate.gemspec +16 -10
  41. metadata +232 -57
  42. data/lib/transrate/express.rb +0 -37
  43. data/lib/transrate/log.rb +0 -16
  44. data/lib/transrate/rb_hit.rb +0 -33
  45. data/lib/transrate/reciprocal_annotation.rb +0 -105
  46. data/lib/transrate/usearch.rb +0 -66
  47. data/test/test_test.rb +0 -41
data/test/test_cmd.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'helper'
2
+
3
+ class TestCmd < Test::Unit::TestCase
4
+
5
+ context "Cmd" do
6
+
7
+ should "run commands" do
8
+ cmd = Transrate::Cmd.new 'echo "success"'
9
+ cmd.run
10
+ assert_equal "success", cmd.stdout.chomp, 'run echo command'
11
+ end
12
+
13
+ end
14
+
15
+ end
@@ -0,0 +1,177 @@
1
+ require 'helper'
2
+
3
+ class TestCompMetrics < Test::Unit::TestCase
4
+
5
+ context "ComparativeMetrics" do
6
+
7
+ setup do
8
+ querypath = File.join(File.dirname(__FILE__),
9
+ 'data',
10
+ 'assembly.2.fa')
11
+ targetpath = File.join(File.dirname(__FILE__),
12
+ 'data',
13
+ 'Os.protein.2.fa')
14
+ assembly = Transrate::Assembly.new(querypath)
15
+ reference = Transrate::Assembly.new(targetpath)
16
+ threads = 8
17
+ @comp = Transrate::ComparativeMetrics.new(assembly, reference, threads)
18
+ end
19
+
20
+
21
+ should "run metrics on assembly" do
22
+ Dir.mktmpdir do |tmpdir|
23
+ Dir.chdir tmpdir do
24
+ @comp.run
25
+ assert @comp.has_run
26
+ end
27
+ end
28
+ end
29
+
30
+ should "calculate ortholog hit ratio" do
31
+ crb = CRBHelper.new(false)
32
+
33
+ hash = Hash.new
34
+ (1..11).each do |i|
35
+ hash["q#{i}"] = []
36
+ end
37
+ # Q1 |------------|
38
+ # Q2 |--------------|
39
+ # T1 |------------------------------------| # coverage = 200/500
40
+ hash["q1"] << HitHelper.new("q1", "t1", 1, 100, 101, 200, 100, 500)
41
+ hash["q2"] << HitHelper.new("q2", "t1", 1, 100, 301, 400, 100, 500)
42
+
43
+ # Q3 |------------|
44
+ # Q4 |--------------|
45
+ # T2 |------------------------------------| # coverage = 200/500
46
+ hash["q3"] << HitHelper.new("q3", "t2", 1, 100, 301, 400, 100, 500)
47
+ hash["q4"] << HitHelper.new("q4", "t2", 1, 100, 101, 200, 100, 500)
48
+
49
+ # Q5 |------------|
50
+ # Q6 |-------------------|
51
+ # T3 |------------------------------------| # coverage = 300/500
52
+ hash["q5"] << HitHelper.new("q5", "t3", 1, 200, 201, 400, 200, 500)
53
+ hash["q6"] << HitHelper.new("q6", "t3", 1, 200, 101, 300, 200, 500)
54
+
55
+ # Q7 |------------|
56
+ # Q8 |------------------------|
57
+ # T4 |------------------------------------| # coverage = 300/500
58
+ hash["q7"] << HitHelper.new("q7", "t4", 1, 100, 201, 300, 100, 500)
59
+ hash["q8"] << HitHelper.new("q8", "t4", 1, 300, 101, 400, 300, 500)
60
+
61
+ # Q9 |--------|
62
+ # Q10 |--------|
63
+ # Q11 |--------------------|
64
+ # T5 |------------------------------------| # coverage = 600/1000
65
+ hash["q9"] << HitHelper.new("q9", "t5", 1, 200, 201, 400, 200, 1000)
66
+ hash["q10"] << HitHelper.new("q10", "t5", 1, 200, 601, 800, 200, 1000)
67
+ hash["q11"] << HitHelper.new("q11", "t5", 1, 400, 301, 700, 400, 1000)
68
+
69
+ crb.hash = hash
70
+ ohr = @comp.ortholog_hit_ratio crb
71
+ assert_equal 16.0/30.0, ohr
72
+ end
73
+
74
+ should "calculate potential chimera count" do
75
+ crb = CRBHelper.new(false)
76
+
77
+ hash = Hash.new
78
+ (1..3).each do |i|
79
+ hash["q#{i}"] = []
80
+ end
81
+
82
+ # T1 |---------|
83
+ # T2 |---------|
84
+ # Q1 |----------------------------| # chimera = true
85
+ hash["q1"] << HitHelper.new("q1", "t1", 101, 200, 1, 100, 500, 100)
86
+ hash["q1"] << HitHelper.new("q1", "t2", 301, 400, 1, 100, 400, 100)
87
+
88
+
89
+ # T3 |---------|
90
+ # T3 |---------|
91
+ # Q2 |----------------------------|
92
+ # chimera = true because the reference has the region 1-100 duplicated
93
+ hash["q2"] << HitHelper.new("q2", "t3", 101, 200, 1, 100, 500, 100)
94
+ hash["q2"] << HitHelper.new("q2", "t3", 301, 400, 1, 100, 400, 100)
95
+
96
+ # # T4 |---------|
97
+ # # T4 |---------|
98
+ # # Q3 |----------------------------|
99
+ # # chimera = false because the two hits map to adjacent, non-duplicated regions (1-100 and 101-200) of the same reference
100
+ hash["q3"] << HitHelper.new("q3", "t4", 101, 200, 1, 100, 500, 200)
101
+ hash["q3"] << HitHelper.new("q3", "t4", 301, 400, 101, 200, 400, 200)
102
+
103
+ crb.hash = hash
104
+ @comp.chimeras crb
105
+ assert_equal 0.667, @comp.p_chimeras.round(3)
106
+ end
107
+
108
+ should "calculate overlap amount" do
109
+ assert_equal 0.5, @comp.overlap_amount(201,500,101,400), "1"
110
+ assert_equal 0.5, @comp.overlap_amount(101,400,201,500), "2"
111
+ assert_equal 0.5, @comp.overlap_amount(201,400,101,500), "3"
112
+ assert_equal 0.5, @comp.overlap_amount(101,500,201,400), "4"
113
+ end
114
+
115
+ should "calculate number of contigs with crbblast hit" do
116
+ Dir.mktmpdir do |tmpdir|
117
+ Dir.chdir tmpdir do
118
+ @comp.run
119
+ assert_equal 11, @comp.comp_stats[:n_contigs_with_recip]
120
+ assert_equal 11/13.0, @comp.comp_stats[:p_contigs_with_recip]
121
+ end
122
+ end
123
+ end
124
+
125
+ should "calculate number of reference sequences with crbblast hit" do
126
+ Dir.mktmpdir do |tmpdir|
127
+ Dir.chdir tmpdir do
128
+ @comp.run
129
+ assert_equal 10, @comp.comp_stats[:n_refs_with_recip]
130
+ assert_equal 0.5, @comp.comp_stats[:p_refs_with_recip]
131
+ end
132
+ end
133
+ end
134
+
135
+ should "calculate reference sequence coverage" do
136
+ # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
137
+ # of their length by CRB-BLAST hit
138
+ Dir.mktmpdir do |tmpdir|
139
+ Dir.chdir tmpdir do
140
+ @comp.run
141
+ stats = @comp.comp_stats
142
+ assert_equal 10, stats[:cov25]
143
+ assert_equal 10, stats[:cov50]
144
+ assert_equal 7, stats[:cov75]
145
+ assert_equal 6, stats[:cov85]
146
+ assert_equal 3, stats[:cov95]
147
+ end
148
+ end
149
+ end
150
+
151
+ should "number of reference sequences coverage" do
152
+ # n&p of reference sequences covered to (25, 50, 75, 85, 95%)
153
+ # of their length by CRB-BLAST hit
154
+ crb = CRBHelper.new(false)
155
+
156
+ hash = Hash.new
157
+ (1..5).each do |i|
158
+ hash["q#{i}"] = []
159
+ end
160
+ hash["q1"] << HitHelper.new("q1", "t1", 1, 250, 101, 350, 250, 1000)
161
+ hash["q2"] << HitHelper.new("q2", "t2", 1, 500, 101, 600, 500, 1000)
162
+ hash["q3"] << HitHelper.new("q3", "t3", 1, 750, 101, 850, 750, 1000)
163
+ hash["q4"] << HitHelper.new("q4", "t4", 1, 850, 101, 950, 850, 1000)
164
+ hash["q5"] << HitHelper.new("q5", "t5", 1, 950, 1, 950, 950, 1000)
165
+
166
+ crb.hash = hash
167
+ ohr = @comp.ortholog_hit_ratio crb
168
+ stats = @comp.comp_stats
169
+ assert_equal 5, stats[:cov25]
170
+ assert_equal 4, stats[:cov50]
171
+ assert_equal 3, stats[:cov75]
172
+ assert_equal 2, stats[:cov85]
173
+ assert_equal 1, stats[:cov95]
174
+ end
175
+
176
+ end
177
+ end
@@ -0,0 +1,61 @@
1
+ require 'helper'
2
+ require 'bio'
3
+
4
+ class TestContig < Test::Unit::TestCase
5
+
6
+ context "Contig" do
7
+
8
+ setup do
9
+ seq = Bio::Sequence.new 'ATGCGTGTATATACGCGTAG'
10
+ @contig = Transrate::Contig.new seq
11
+ end
12
+
13
+ should "know the number and proportion of each base it contains" do
14
+ assert_equal 5, @contig.bases_a, "count of base a"
15
+ assert_equal 0.25, @contig.prop_a, "proportion of base a"
16
+ assert_equal 3, @contig.bases_c, "count of base c"
17
+ assert_equal 0.15, @contig.prop_c, "proportion of base c"
18
+ assert_equal 6, @contig.bases_g, "count of base g"
19
+ assert_equal 0.3, @contig.prop_g, "proportion of base g"
20
+ assert_equal 6, @contig.bases_t, "count of base t"
21
+ assert_equal 0.3, @contig.prop_t, "proportion of base t"
22
+ assert_equal 0, @contig.bases_n, "count of base n"
23
+ assert_equal 0.0, @contig.prop_n, "proportion of base n"
24
+ end
25
+
26
+ should "know how many of each two-base pair it contains" do
27
+ assert_equal 3, @contig.dibase_composition[:cg], "cg count"
28
+ assert_equal 3, @contig.dibase_composition[:at], "at count"
29
+ assert_equal 2, @contig.dibase_composition[:tg], "tg count"
30
+ end
31
+
32
+ should "know its own gc content" do
33
+ assert_equal 9, @contig.bases_gc, "count of bases that are c or g"
34
+ assert_equal 0.45, @contig.prop_gc.round(2),
35
+ "proportion of bases that are c or g"
36
+ end
37
+
38
+ should "know its own base-pair skew" do
39
+ assert_equal 0.45, @contig.gc_skew.round(2), "gc skew"
40
+ assert_equal 0.55, @contig.at_skew.round(2), "at skew"
41
+ end
42
+
43
+ should "know its own CpG count and density" do
44
+ assert_equal 3, @contig.cpg_count, "cpg count"
45
+ assert_equal 66.67, @contig.cpg_ratio.round(2), "cpg ratio"
46
+ end
47
+
48
+ should "know the length of its own longest orf" do
49
+ assert_equal 6, @contig.orf_length, "orf length"
50
+ end
51
+
52
+
53
+ should "know its own linguistic complexity" do
54
+ assert_equal 0.0586, @contig.linguistic_complexity(4).round(4),
55
+ "linguistic complexity k=4"
56
+ assert_equal 0.0037, @contig.linguistic_complexity(6).round(4),
57
+ "linguistic complexity k=6"
58
+ end
59
+
60
+ end
61
+ end
@@ -0,0 +1,50 @@
1
+ require 'helper'
2
+
3
+ class TestContigMetrics < Test::Unit::TestCase
4
+
5
+ context "transrate" do
6
+
7
+ setup do
8
+ querypath = File.join(File.dirname(__FILE__), 'data',
9
+ 'assembly.fasta')
10
+ assembly = Transrate::Assembly.new(querypath)
11
+ @contig_metrics = Transrate::ContigMetrics.new(assembly)
12
+ end
13
+
14
+ should "run metrics on assembly" do
15
+ @contig_metrics.run
16
+ assert @contig_metrics.has_run
17
+ end
18
+
19
+ should "get gc content" do
20
+ @contig_metrics.run
21
+ assert_equal 0.37672, @contig_metrics.gc_prop.round(5)
22
+ end
23
+
24
+ should "get gc skew" do
25
+ @contig_metrics.run
26
+ assert_equal 0.00440, @contig_metrics.gc_skew.round(5)
27
+ end
28
+
29
+ should "get at skew" do
30
+ @contig_metrics.run
31
+ assert_equal -0.00718, @contig_metrics.at_skew.round(5)
32
+ end
33
+
34
+ should "get CpG density" do
35
+ @contig_metrics.run
36
+ assert_equal 0.52828, @contig_metrics.cpg_ratio.round(5)
37
+ end
38
+
39
+ should "get linguistic complexity" do
40
+ @contig_metrics.run
41
+ assert_equal 0.26599, @contig_metrics.linguistic_complexity.round(5)
42
+ end
43
+
44
+ should "get the number and proportion of Ns" do
45
+ @contig_metrics.run
46
+ assert_equal 2, @contig_metrics.bases_n
47
+ assert_equal 0.00033, @contig_metrics.proportion_n.round(5)
48
+ end
49
+ end
50
+ end
data/test/test_inline.rb CHANGED
@@ -1,29 +1,30 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'helper'
4
+ require 'bio'
4
5
 
5
6
  class TestInline < Test::Unit::TestCase
6
7
 
7
8
  context 'transrate' do
8
9
 
9
10
  setup do
10
- @a = Transrate::Assembly.new('test/assembly.fasta')
11
- @seq1 = 'ATGCCCCTAGGGTAG'
11
+ filepath = File.join(File.dirname(__FILE__), 'data', 'assembly.fasta')
12
+ @a = Transrate::Assembly.new(filepath)
12
13
  end
13
14
 
14
15
  should 'find longest orf in file' do
15
16
  orfs = []
16
- @a.assembly.each do |entry|
17
- l = @a.orf_length entry.seq
18
- orfs << l
17
+ @a.assembly.each do |contig|
18
+ orfs << contig.orf_length
19
19
  end
20
- assert_equal orfs.length, 4
21
- assert_equal orfs, [333, 370, 131, 84]
20
+ assert_equal 4, orfs.length
21
+ assert_equal [333, 370, 131, 84], orfs
22
22
  end
23
23
 
24
24
  should 'find longest orf in sequence' do
25
- l = @a.orf_length(@seq1)
26
- assert_equal l, 4
25
+ seq = Bio::Sequence.new 'ATGCCCCTAGGGTAG'
26
+ contig = Transrate::Contig.new seq
27
+ assert_equal 4, contig.orf_length
27
28
  end
28
29
 
29
30
  end
@@ -0,0 +1,68 @@
1
+ require 'helper'
2
+ require 'tmpdir'
3
+
4
+ class TestReadMetrics < Test::Unit::TestCase
5
+
6
+ context "ReadMetrics" do
7
+
8
+ setup do
9
+ query = File.join(File.dirname(__FILE__), 'data',
10
+ 'sorghum_transcript.fa')
11
+ assembly = Transrate::Assembly.new(query)
12
+ @read_metrics = Transrate::ReadMetrics.new(assembly)
13
+ end
14
+
15
+ teardown do
16
+ if File.exist?("test/data/sorghum_transcript.fa.fai")
17
+ rm = "rm test/data/sorghum_transcript.fa.fai"
18
+ `#{rm}`
19
+ end
20
+ end
21
+
22
+ should "setup correctly" do
23
+ assert @read_metrics
24
+ end
25
+
26
+ should "calculate read mapping statistics" do
27
+ left = File.join(File.dirname(__FILE__), 'data', '150uncovered.l.fq')
28
+ right = File.join(File.dirname(__FILE__), 'data', '150uncovered.r.fq')
29
+ Dir.mktmpdir do |tmpdir|
30
+ Dir.chdir tmpdir do
31
+ @read_metrics.run(left, right)
32
+ stats = @read_metrics.read_stats
33
+ assert @read_metrics.has_run
34
+ assert_equal 223, stats[:num_pairs], 'number of read pairs'
35
+ assert_equal 202, stats[:total_mappings], 'number mapping'
36
+ assert_equal 90.58, stats[:percent_mapping].round(2),
37
+ 'percent mapping'
38
+ assert_equal 202, stats[:good_mappings], 'good mapping'
39
+ assert_equal 90.58,
40
+ stats[:pc_good_mapping].round(2),
41
+ 'percent good mapping'
42
+ assert_equal 0, stats[:bad_mappings], 'bad mapping'
43
+ assert_equal 22.91, stats[:mean_coverage].round(2), 'mean coverage'
44
+ assert_equal 11, stats[:n_uncovered_bases], 'n uncovered bases'
45
+ assert_equal 0.007,
46
+ stats[:p_uncovered_bases].round(3),
47
+ 'p uncovered bases'
48
+ end
49
+ end
50
+ end
51
+
52
+ should "find read pairs that support scaffolding" do
53
+ left = File.join(File.dirname(__FILE__), 'data', 'bridging_reads.l.fastq')
54
+ right = File.join(File.dirname(__FILE__),
55
+ 'data',
56
+ 'bridging_reads.r.fastq')
57
+ Dir.mktmpdir do |tmpdir|
58
+ Dir.chdir tmpdir do
59
+ @read_metrics.run(left, right)
60
+ stats = @read_metrics.read_stats
61
+ assert_equal 1, stats[:potential_bridges], 'potential bridges'
62
+ end
63
+ end
64
+ end
65
+
66
+ end
67
+
68
+ end
@@ -0,0 +1,22 @@
1
+ require 'helper'
2
+
3
+ class TestSamtools < Test::Unit::TestCase
4
+
5
+ context "samtools" do
6
+
7
+ should "know the path to samtools binary" do
8
+ msg = /Program: samtools/
9
+ path = Transrate::Samtools.path
10
+ res = `#{path} 2>&1`.split("\n").join
11
+ assert msg =~ res
12
+ end
13
+
14
+ should "run commands" do
15
+ sam = File.join(File.dirname(__FILE__), 'data', 'tiny.sam')
16
+ Transrate::Samtools.run "view -bS #{sam} > tiny.bam"
17
+ assert_equal 460, File.size('tiny.bam'), 'bam file should be created'
18
+ File.delete 'tiny.bam'
19
+ end
20
+
21
+ end
22
+ end
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'helper'
4
+
5
+ class TestTransrate < Test::Unit::TestCase
6
+
7
+ context "transrate" do
8
+
9
+ setup do
10
+ filepath = File.join(File.dirname(__FILE__), 'data', 'assembly.fasta')
11
+ @a = Transrate::Assembly.new(filepath)
12
+ end
13
+
14
+ should "create assembly object" do
15
+ assert @a
16
+ assert_equal @a.assembly.size, 4
17
+ end
18
+
19
+ should "run basic stats" do
20
+ stats = @a.basic_stats
21
+ assert_equal stats["n_seqs"], 4
22
+ assert_equal stats["smallest"], 1409
23
+ assert_equal stats["largest"], 1630
24
+ assert_equal stats["mean_len"], 1508.25
25
+ end
26
+
27
+ should "run metrics on assembly" do
28
+ ans = @a.run(2) # using 2 threads
29
+ assert_equal ans, true, "should run but returned #{ans}"
30
+ end
31
+
32
+ should "find the mean length" do
33
+ ans = @a.run(2)
34
+ mean = @a.mean_len
35
+ n_bases = @a.n_bases
36
+ assert_equal mean, 1508.25
37
+ assert_equal n_bases, 6033
38
+ end
39
+ end
40
+ end