transrate 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +16 -1
  3. data/.travis.yml +8 -0
  4. data/README.md +45 -43
  5. data/Rakefile +36 -0
  6. data/bin/transrate +98 -50
  7. data/deps/deps.yaml +55 -0
  8. data/lib/transrate.rb +19 -4
  9. data/lib/transrate/assembly.rb +93 -182
  10. data/lib/transrate/bowtie2.rb +37 -13
  11. data/lib/transrate/cmd.rb +19 -0
  12. data/lib/transrate/comparative_metrics.rb +239 -19
  13. data/lib/transrate/contig.rb +212 -0
  14. data/lib/transrate/contig_metrics.rb +76 -0
  15. data/lib/transrate/read_metrics.rb +83 -41
  16. data/lib/transrate/samtools.rb +73 -0
  17. data/lib/transrate/transrater.rb +31 -11
  18. data/lib/transrate/version.rb +1 -1
  19. data/test/data/150uncovered.l.fq +892 -0
  20. data/test/data/150uncovered.r.fq +892 -0
  21. data/test/data/Os.protein.2.fa +95 -0
  22. data/test/data/Os.protein.fa +199 -0
  23. data/test/data/assembly.2.fa +26 -0
  24. data/test/{assembly.fasta → data/assembly.fasta} +0 -0
  25. data/test/data/bridging_reads.l.fastq +20 -0
  26. data/test/data/bridging_reads.r.fastq +20 -0
  27. data/test/data/sorghum_transcript.fa +4 -0
  28. data/test/data/tiny.sam +4 -0
  29. data/test/helper.rb +33 -2
  30. data/test/test_bowtie.rb +54 -0
  31. data/test/test_cmd.rb +15 -0
  32. data/test/test_comp_metrics.rb +177 -0
  33. data/test/test_contig.rb +61 -0
  34. data/test/test_contig_metrics.rb +50 -0
  35. data/test/test_inline.rb +10 -9
  36. data/test/test_read_metrics.rb +68 -0
  37. data/test/test_samtools.rb +22 -0
  38. data/test/test_transrate.rb +40 -0
  39. data/test/test_transrater.rb +68 -0
  40. data/transrate.gemspec +16 -10
  41. metadata +232 -57
  42. data/lib/transrate/express.rb +0 -37
  43. data/lib/transrate/log.rb +0 -16
  44. data/lib/transrate/rb_hit.rb +0 -33
  45. data/lib/transrate/reciprocal_annotation.rb +0 -105
  46. data/lib/transrate/usearch.rb +0 -66
  47. data/test/test_test.rb +0 -41
@@ -1,42 +1,66 @@
1
1
  module Transrate
2
2
 
3
# Error raised by Bowtie2 when a required executable cannot be found in the
# path, or when reads are mapped before an index has been built.
Bowtie2Error = Class.new(StandardError)
5
+
3
6
  class Bowtie2
4
7
 
5
8
  require 'which'
6
9
  include Which
7
10
 
11
+ attr_reader :index_name, :sam
12
+
8
13
  def initialize
9
14
  bowtie2_path = which('bowtie2')
10
- raise "could not find bowtie2 in the path" if bowtie2_path.empty?
15
+ if bowtie2_path.empty?
16
+ raise Bowtie2Error.new("could not find bowtie2 in the path")
17
+ end
11
18
  @bowtie2 = bowtie2_path.first
12
19
  bowtie2_build_path = which('bowtie2-build')
13
- raise "could not find bowtie2-build in the path" if bowtie2_build_path.empty?
20
+ if bowtie2_build_path.empty?
21
+ raise Bowtie2Error.new("could not find bowtie2-build in the path")
22
+ end
14
23
  @bowtie2_build = bowtie2_build_path.first
24
+ @index_built = false
25
+ @index_name = ""
15
26
  end
16
27
 
17
- def map_reads file, left, right=nil, insertsize=200, insertsd=50, outputname=nil
28
+ def map_reads(file, left,
29
+ right, insertsize: 200,
30
+ insertsd: 50, outputname: nil,
31
+ threads: 8)
32
+ raise Bowtie2Error.new("Index not built") if !@index_built
18
33
  lbase = File.basename(left)
19
34
  rbase = File.basename(right)
20
- outputname ||= "#{lbase}.#{rbase}.#{File.basename(file)}.sam"
35
+ index = File.basename(@index_name)
36
+ @sam = File.expand_path("#{lbase}.#{rbase}.#{index}.sam")
21
37
  realistic_dist = insertsize + (3 * insertsd)
22
- unless File.exists? outputname
38
+ unless File.exists? @sam
23
39
  # construct bowtie command
24
- bowtiecmd = "#{@bowtie2} --very-sensitive-local -k 10 -p 8 -X #{realistic_dist}" # TODO number of cores should be variable '-p 8'
25
- bowtiecmd += " --no-unal --quiet"
26
- bowtiecmd += " #{File.basename(file)} -1 #{left}"
40
+ bowtiecmd = "#{@bowtie2} --very-sensitive"
41
+ bowtiecmd += " -p #{threads} -X #{realistic_dist}"
42
+ bowtiecmd += " --quiet --no-unal"
43
+ bowtiecmd += " --seed 1337"
44
+ bowtiecmd += " -x #{@index_name}"
45
+ bowtiecmd += " -1 #{left}"
27
46
  # paired end?
28
47
  bowtiecmd += " -2 #{right}" if right
29
- bowtiecmd += " > #{outputname}"
48
+ bowtiecmd += " -S #{@sam}"
30
49
  # run bowtie
31
- `#{bowtiecmd}`
50
+ runner = Cmd.new bowtiecmd
51
+ runner.run
32
52
  end
33
- outputname
53
+ @sam
34
54
  end
35
55
 
36
56
  def build_index file
37
- unless File.exists?(file + '.1.bt2')
38
- `#{@bowtie2_build} --offrate 1 #{file} #{File.basename(file)}`
57
+ @index_name = File.basename(file).split(".")[0..-2].join(".")
58
+ unless File.exists?(@index_name + '.1.bt2')
59
+ cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
60
+ runner = Cmd.new cmd
61
+ runner.run
39
62
  end
63
+ @index_built = true
40
64
  end
41
65
 
42
66
  end # Bowtie2
@@ -0,0 +1,19 @@
1
+ require 'open3'
2
+
3
+ module Transrate
4
+
5
# Runs a shell command and keeps what it produced.
#
# After #run, +stdout+ and +stderr+ hold the captured output and +status+
# holds the exit status object returned by Open3.capture3.
class Cmd

  attr_accessor :cmd, :stdout, :stderr, :status

  # cmd - the command line to execute
  def initialize(cmd)
    @cmd = cmd
  end

  # Execute the command and record its output and status.
  #
  # Returns the [stdout, stderr, status] array.
  def run
    Open3.capture3(@cmd).tap do |out, err, exit_status|
      @stdout = out
      @stderr = err
      @status = exit_status
    end
  end

end
18
+
19
+ end
@@ -1,7 +1,8 @@
1
1
  require 'set'
2
+ require 'crb-blast'
2
3
 
3
4
  module Transrate
4
-
5
+
5
6
  class ComparativeMetrics
6
7
 
7
8
  attr_reader :rbh_per_contig
@@ -9,21 +10,26 @@ module Transrate
9
10
  attr_reader :reciprocal_hits
10
11
  attr_reader :has_run
11
12
  attr_reader :reference_coverage
13
+ attr_reader :n_chimeras, :p_chimeras
12
14
 
13
# assembly  - the assembly to evaluate
# reference - the reference to compare the assembly against
# threads   - thread count, later handed to the reciprocal blast run
def initialize(assembly, reference, threads)
  @assembly, @reference, @threads = assembly, reference, threads
end
18
20
 
19
21
  def run
20
- rbu = reciprocal_best_ublast
21
- @ortholog_hit_ratio = ortholog_hit_ratio rbu
22
- @collapse_factor = collapse_factor @ra.r2l_hits
23
- @reciprocal_hits = rbu.size
22
+ @crbblast = reciprocal_best_blast
23
+ @ortholog_hit_ratio = ortholog_hit_ratio @crbblast
24
+ @collapse_factor = collapse_factor @crbblast.target_results
25
+ @reciprocal_hits = @crbblast.size
24
26
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
25
27
  @reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
26
28
  @rbh_per_contig = @reciprocal_hits.to_f / @assembly.assembly.size.to_f
29
+ @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
30
+ @n_contigs_with_recip = @crbblast.reciprocals.size
31
+ @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
32
+ chimeras @crbblast
27
33
  @has_run = true
28
34
  end
29
35
 
@@ -31,33 +37,247 @@ module Transrate
31
37
  {
32
38
  :reciprocal_hits => @reciprocal_hits,
33
39
  :rbh_per_contig => @rbh_per_contig,
40
+ :p_contigs_with_recip => @p_contigs_with_recip,
41
+ :n_contigs_with_recip => @n_contigs_with_recip,
42
+ :p_refs_with_recip => @p_refs_with_recip,
43
+ :n_refs_with_recip => @n_refs_with_recip,
34
44
  :rbh_per_reference => @rbh_per_reference,
35
45
  :reference_coverage => @reference_coverage,
36
46
  :ortholog_hit_ratio => @ortholog_hit_ratio,
37
- :collapse_factor => @collapse_factor
47
+ :collapse_factor => @collapse_factor,
48
+ :n_chimeras => @n_chimeras,
49
+ :p_chimeras => @p_chimeras,
50
+ :cov25 => @cov[0],
51
+ :cov50 => @cov[1],
52
+ :cov75 => @cov[2],
53
+ :cov85 => @cov[3],
54
+ :cov95 => @cov[4],
55
+ :p_cov25 => @cov[0]/@reference.size.to_f,
56
+ :p_cov50 => @cov[1]/@reference.size.to_f,
57
+ :p_cov75 => @cov[2]/@reference.size.to_f,
58
+ :p_cov85 => @cov[3]/@reference.size.to_f,
59
+ :p_cov95 => @cov[4]/@reference.size.to_f
38
60
  }
39
61
  end
40
62
 
41
# Create a CRB_Blast for the assembly and reference files and run it with
# an e-value cutoff of 1e-5 and the configured number of threads.
#
# Returns the CRB_Blast object after it has run.
def reciprocal_best_blast
  CRB_Blast.new(@assembly.file, @reference.file).tap do |blaster|
    blaster.run(1e-5, @threads)
  end
end
45
68
 
46
# Proportion of reference sequence covered by reciprocal hits.
#
# Reciprocal hits are grouped by the reference (target) they hit. For each
# reference the hit intervals are merged, so that a base covered by several
# hits is counted once, and the covered length is compared with the
# reference length. The value returned is the total covered length divided
# by the total length of all references that have at least one reciprocal
# hit.
#
# crbblast - object responding to +reciprocals+ (a Hash whose values are
#            lists of hits with +target+, +tlen+, +tstart+ and +tend+) and
#            +target_is_prot+
#
# Side effects:
#   @n_refs_with_recip - number of references with a reciprocal hit
#   @cov               - number of references whose covered proportion is
#                        at least 25%, 50%, 75%, 85% and 95% (recomputed on
#                        every call rather than accumulated)
#
# If @ortholog_hit_ratio is already set it is returned without any work.
# Returns a Float; 0.0 when there are no reciprocal hits.
def ortholog_hit_ratio(crbblast)
  return @ortholog_hit_ratio unless @ortholog_hit_ratio.nil?

  # group the reciprocal hits by the reference they hit
  targets = Hash.new { |hash, target| hash[target] = [] }
  crbblast.reciprocals.each_value do |hits|
    hits.each { |hit| targets[hit.target] << hit }
  end
  @n_refs_with_recip = targets.size

  thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
  @cov = Array.new(thresholds.size, 0)
  # protein lengths and coordinates are multiplied by 3 to get nucleotides
  scale = crbblast.target_is_prot ? 3 : 1
  total_coverage = 0
  total_length = 0
  targets.each_value do |hits|
    target_length = hits.last.tlen * scale
    intervals = hits.map do |hit|
      [hit.tstart * scale, hit.tend * scale].minmax
    end
    length_of_coverage = covered_length(intervals)
    proportion = length_of_coverage / target_length.to_f
    thresholds.each_with_index do |threshold, i|
      @cov[i] += 1 if proportion >= threshold
    end
    total_coverage += length_of_coverage
    total_length += target_length
  end
  return 0.0 if total_length.zero?
  total_coverage / total_length.to_f
end

# Number of positions covered by the union of inclusive [start, stop]
# intervals. Intervals that overlap or share an end position are merged
# before their lengths are summed.
def covered_length(intervals)
  total = 0
  current_start = current_stop = nil
  intervals.sort.each do |start, stop|
    if current_stop && start <= current_stop
      # overlaps the interval being built: extend it if it reaches further
      current_stop = stop if stop > current_stop
    else
      total += current_stop - current_start + 1 if current_stop
      current_start = start
      current_stop = stop
    end
  end
  total += current_stop - current_start + 1 if current_stop
  total
end
private :covered_length
183
+
184
# Count contigs whose reciprocal hits look chimeric.
#
# Every pair of hits of a contig is examined. A pair is counted when both
# hits are to the same target and their target intervals overlap by more
# than 0.75, or when the hits are to different targets and their query
# intervals overlap by less than 0.25. A contig is counted as a chimera
# when the number of counted pairs divided by its number of hits is at
# least 0.5.
#
# Sets @n_chimeras and @p_chimeras (chimeras divided by the number of
# contigs with reciprocal hits) and returns @p_chimeras.
def chimeras(crbblast)
  @n_chimeras = 0
  crbblast.reciprocals.each_value do |hits|
    counted_pairs = hits.combination(2).count do |first, second|
      if first.target == second.target
        first_span = [first.tstart, first.tend].minmax
        second_span = [second.tstart, second.tend].minmax
        overlap_amount(*first_span, *second_span) > 0.75
      else
        first_span = [first.qstart, first.qend].minmax
        second_span = [second.qstart, second.qend].minmax
        overlap_amount(*first_span, *second_span) < 0.25
      end
    end
    @n_chimeras += 1 if counted_pairs / hits.size.to_f >= 0.5
  end
  @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
end
217
+
218
# Classify how interval a (astart..astop) relates to interval b
# (bstart..bstop).
#
# Returns
#   0 - a and b are identical
#   1 - b starts at or before a's start and ends inside a
#   2 - a starts before b and ends inside b
#   3 - b starts at or before a's start and ends after a
#   4 - a starts before b and ends after b
#   5 - no overlap, a starts before b
#   6 - no overlap, a does not start before b
#
# An interval whose stop equals the other interval's start is reported as
# not overlapping.
def overlap(astart, astop, bstart, bstop)
  return 0 if astart == bstart && astop == bstop

  if astart < bstart
    return 5 unless astop > bstart
    astop > bstop ? 4 : 2
  else
    return 6 unless bstop > astart
    bstop > astop ? 3 : 1
  end
end
243
+
244
# Size of the overlap between intervals a and b relative to the size of
# the region they span together.
#
# Returns 1 when the intervals are identical and 0 when they do not overlap
# (an interval whose stop equals the other's start counts as no overlap).
# Otherwise returns a Float: the length of the shared region divided by
# the length from the earlier start to the later stop.
def overlap_amount(astart, astop, bstart, bstop)
  return 1 if astart == bstart && astop == bstop

  # order the two intervals so that `lo` is the one that starts first
  lo_start, lo_stop, hi_start, hi_stop =
    if astart < bstart
      [astart, astop, bstart, bstop]
    else
      [bstart, bstop, astart, astop]
    end
  return 0 unless lo_stop > hi_start

  if lo_stop > hi_stop
    # hi lies entirely inside lo
    (hi_stop - hi_start + 1) / (lo_stop - lo_start + 1).to_f
  else
    (lo_stop - hi_start + 1) / (hi_stop - lo_start + 1).to_f
  end
end
51
269
 
52
270
  def collapse_factor hits=nil
53
271
  return @collapse_factor unless @collapse_factor.nil?
54
272
  targets = {}
55
- hits.each_pair do |query, hit|
56
- target = hit.target
57
- unless targets.has_key? target
58
- targets[target] = Set.new
273
+ hits.each_pair do |query, list|
274
+ list.each do |hit|
275
+ target = hit.target
276
+ unless targets.has_key? target
277
+ targets[target] = Set.new
278
+ end
279
+ targets[target] << query
59
280
  end
60
- targets[target] << query
61
281
  end
62
282
  sum = targets.values.reduce(0.0){ |summer, val| summer += val.size }
63
283
  sum / targets.size
@@ -0,0 +1,212 @@
1
+ require 'forwardable'
2
+ require 'inline'
3
+
4
+ module Transrate
5
+
6
# A contig in a transcriptome assembly.
#
# Wraps a sequence object and provides composition statistics for it. The
# wrapped object must respond to +seq+ (returning the sequence as a String)
# and to +length+ and +size+, which are delegated to it.
class Contig

  include Enumerable
  extend Forwardable
  def_delegators :@seq, :size, :length
  attr_accessor :seq, :name, :coverage

  # seq  - the sequence object to wrap
  # name - name of the contig; only used when +seq+ does not respond to
  #        +entry_id+, otherwise the entry_id is used as the name
  def initialize(seq, name: nil)
    @seq = seq
    @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
  end

  # Iterate over the bases of the contig, one character at a time.
  def each(&block)
    @seq.seq.each_char(&block)
  end

  # Base composition of the contig
  #
  # Returns a Hash of lowercased base symbol => count. The keys :a, :t, :c,
  # :g and :n are always present; any other character found in the sequence
  # (for example an ambiguity code) gets a key of its own instead of
  # raising. The result is computed once and cached, together with the
  # dibase composition.
  def base_composition
    return @base_composition if @base_composition

    base_comp = { :a => 0, :t => 0, :c => 0, :g => 0, :n => 0 }
    dibase_comp = { :cg => 0 }
    last_base = nil
    @seq.seq.each_char do |base|
      # single bases
      key = base.downcase.to_sym
      base_comp[key] = base_comp.fetch(key, 0) + 1
      if last_base
        # pairs of bases
        dikey = "#{last_base}#{base}".downcase.to_sym
        dibase_comp[dikey] = dibase_comp.fetch(dikey, 0) + 1
      end
      last_base = base
    end
    @base_composition = base_comp
    @dibase_composition = dibase_comp
    base_comp
  end

  # Dibase composition of the contig
  #
  # Returns a Hash of lowercased two-base symbol => count for every pair of
  # adjacent bases seen. :cg is always present.
  def dibase_composition
    base_composition unless @dibase_composition
    @dibase_composition
  end

  # Number of bases that are C
  def bases_c
    base_composition[:c]
  end

  # Proportion of bases that are C
  def prop_c
    bases_c / length.to_f
  end

  # Number of bases that are G
  def bases_g
    base_composition[:g]
  end

  # Proportion of bases that are G
  def prop_g
    bases_g / length.to_f
  end

  # Number of bases that are A
  def bases_a
    base_composition[:a]
  end

  # Proportion of bases that are A
  def prop_a
    bases_a / length.to_f
  end

  # Number of bases that are T
  def bases_t
    base_composition[:t]
  end

  # Proportion of bases that are T
  def prop_t
    bases_t / length.to_f
  end

  # Number of bases that are N
  def bases_n
    base_composition[:n]
  end

  # Proportion of bases that are N
  def prop_n
    bases_n / length.to_f
  end

  # Number of bases that are G or C
  def bases_gc
    bases_g + bases_c
  end

  # Proportion of bases that are G or C
  def prop_gc
    prop_g + prop_c
  end

  # GC skew: G and C as a proportion of the A, C, G and T bases.
  # NOTE(review): this is not the conventional (G - C) / (G + C) skew;
  # confirm which definition callers expect.
  def gc_skew
    prop_gc / (prop_a + prop_t + prop_gc)
  end

  # AT skew: A and T as a proportion of the A, C, G and T bases, the
  # counterpart of gc_skew. The sum is parenthesised so that the whole of
  # A + T is divided, not only T.
  def at_skew
    (prop_a + prop_t) / (prop_a + prop_t + prop_gc)
  end

  # CpG count
  def cpg_count
    dibase_composition[:cg]
  end

  # CpG (C-phosphate-G) ratio: the CpG count divided by the product of the
  # C and G proportions.
  # NOTE(review): an observed/expected CpG ratio would also divide the
  # count by the sequence length; confirm the intended definition.
  def cpg_ratio
    dibase_composition[:cg] / (prop_c * prop_g)
  end

  # Length of the longest open reading frame in the contig: the longest run
  # of codons without a stop codon in any of the six frames, counted in
  # codons. Only uppercase stop codons are recognised.
  def orf_length
    longest_orf(@seq.seq)
  end

  # Inlined C longest-ORF function
  #
  # len[0..2] hold the current run of codons in the three forward frames,
  # len[3..5] the same for the reverse strand (whose stop codons appear as
  # CTA, TTA and TCA). When the end of the sequence is reached the run that
  # is still open in every one of the six frames is compared with the
  # longest run seen so far.
  inline do |builder|
    builder.c <<SRC
static
void
longest_orf(VALUE _s) {
  int i,sl,longest=0;
  int len[6];
  char * c_str;

  sl = RSTRING_LEN(_s);
  c_str = StringValueCStr(_s);
  for (i=0;i<6;i++) {
    len[i]=0;
  }
  for (i=0;i<sl-2;i++) {
    if (c_str[i]=='T' &&
      ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
      (c_str[i+1]=='A' && c_str[i+2]=='A') ||
      (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
      if (len[i%3] > longest) {
        longest = len[i%3];
      }
      len[i%3]=0;
    } else {
      len[i%3]++;
    }
    if (c_str[i+2]=='A' &&
      ((c_str[i]=='C' && c_str[i+1]=='T') ||
      (c_str[i]=='T' && c_str[i+1]=='T') ||
      (c_str[i]=='T' && c_str[i+1]=='C'))) {
      if (len[3+i%3] > longest) {
        longest = len[3+i%3];
      }
      len[3+i%3]=0;
    } else {
      len[3+i%3]++;
    }
  }
  for (i=0;i<6;i++) {
    if (len[i] > longest) {
      longest = len[i];
    }
  }
  return INT2NUM(longest);
}
SRC
  end

  # Linguistic complexity for words of length k: the number of distinct
  # k-mers in the contig (compared case-insensitively) divided by the
  # number of possible k-mers, 4 ** k.
  def linguistic_complexity(k)
    possible = 4 ** k
    sequence = @seq.seq
    # a Hash is used as the set of k-mers seen, so no extra library is needed
    seen = {}
    (0..@seq.length - k).each do |i|
      seen[sequence.slice(i, k).upcase] = true # slice(start, length)
    end
    seen.size / possible.to_f
  end

end
211
+
212
+ end