transrate 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.gitignore +16 -1
  3. data/.travis.yml +8 -0
  4. data/README.md +45 -43
  5. data/Rakefile +36 -0
  6. data/bin/transrate +98 -50
  7. data/deps/deps.yaml +55 -0
  8. data/lib/transrate.rb +19 -4
  9. data/lib/transrate/assembly.rb +93 -182
  10. data/lib/transrate/bowtie2.rb +37 -13
  11. data/lib/transrate/cmd.rb +19 -0
  12. data/lib/transrate/comparative_metrics.rb +239 -19
  13. data/lib/transrate/contig.rb +212 -0
  14. data/lib/transrate/contig_metrics.rb +76 -0
  15. data/lib/transrate/read_metrics.rb +83 -41
  16. data/lib/transrate/samtools.rb +73 -0
  17. data/lib/transrate/transrater.rb +31 -11
  18. data/lib/transrate/version.rb +1 -1
  19. data/test/data/150uncovered.l.fq +892 -0
  20. data/test/data/150uncovered.r.fq +892 -0
  21. data/test/data/Os.protein.2.fa +95 -0
  22. data/test/data/Os.protein.fa +199 -0
  23. data/test/data/assembly.2.fa +26 -0
  24. data/test/{assembly.fasta → data/assembly.fasta} +0 -0
  25. data/test/data/bridging_reads.l.fastq +20 -0
  26. data/test/data/bridging_reads.r.fastq +20 -0
  27. data/test/data/sorghum_transcript.fa +4 -0
  28. data/test/data/tiny.sam +4 -0
  29. data/test/helper.rb +33 -2
  30. data/test/test_bowtie.rb +54 -0
  31. data/test/test_cmd.rb +15 -0
  32. data/test/test_comp_metrics.rb +177 -0
  33. data/test/test_contig.rb +61 -0
  34. data/test/test_contig_metrics.rb +50 -0
  35. data/test/test_inline.rb +10 -9
  36. data/test/test_read_metrics.rb +68 -0
  37. data/test/test_samtools.rb +22 -0
  38. data/test/test_transrate.rb +40 -0
  39. data/test/test_transrater.rb +68 -0
  40. data/transrate.gemspec +16 -10
  41. metadata +232 -57
  42. data/lib/transrate/express.rb +0 -37
  43. data/lib/transrate/log.rb +0 -16
  44. data/lib/transrate/rb_hit.rb +0 -33
  45. data/lib/transrate/reciprocal_annotation.rb +0 -105
  46. data/lib/transrate/usearch.rb +0 -66
  47. data/test/test_test.rb +0 -41
@@ -1,42 +1,66 @@
1
1
  module Transrate
2
2
 
3
# Error type raised by Bowtie2 when an executable cannot be found on the
# path or when reads are mapped before an index has been built.
class Bowtie2Error < StandardError; end
5
+
3
6
  class Bowtie2
4
7
 
5
8
  require 'which'
6
9
  include Which
7
10
 
11
+ attr_reader :index_name, :sam
12
+
8
13
  def initialize
9
14
  bowtie2_path = which('bowtie2')
10
- raise "could not find bowtie2 in the path" if bowtie2_path.empty?
15
+ if bowtie2_path.empty?
16
+ raise Bowtie2Error.new("could not find bowtie2 in the path")
17
+ end
11
18
  @bowtie2 = bowtie2_path.first
12
19
  bowtie2_build_path = which('bowtie2-build')
13
- raise "could not find bowtie2-build in the path" if bowtie2_build_path.empty?
20
+ if bowtie2_build_path.empty?
21
+ raise Bowtie2Error.new("could not find bowtie2-build in the path")
22
+ end
14
23
  @bowtie2_build = bowtie2_build_path.first
24
+ @index_built = false
25
+ @index_name = ""
15
26
  end
16
27
 
17
- def map_reads file, left, right=nil, insertsize=200, insertsd=50, outputname=nil
28
+ def map_reads(file, left,
29
+ right, insertsize: 200,
30
+ insertsd: 50, outputname: nil,
31
+ threads: 8)
32
+ raise Bowtie2Error.new("Index not built") if !@index_built
18
33
  lbase = File.basename(left)
19
34
  rbase = File.basename(right)
20
- outputname ||= "#{lbase}.#{rbase}.#{File.basename(file)}.sam"
35
+ index = File.basename(@index_name)
36
+ @sam = File.expand_path("#{lbase}.#{rbase}.#{index}.sam")
21
37
  realistic_dist = insertsize + (3 * insertsd)
22
- unless File.exists? outputname
38
+ unless File.exists? @sam
23
39
  # construct bowtie command
24
- bowtiecmd = "#{@bowtie2} --very-sensitive-local -k 10 -p 8 -X #{realistic_dist}" # TODO number of cores should be variable '-p 8'
25
- bowtiecmd += " --no-unal --quiet"
26
- bowtiecmd += " #{File.basename(file)} -1 #{left}"
40
+ bowtiecmd = "#{@bowtie2} --very-sensitive"
41
+ bowtiecmd += " -p #{threads} -X #{realistic_dist}"
42
+ bowtiecmd += " --quiet --no-unal"
43
+ bowtiecmd += " --seed 1337"
44
+ bowtiecmd += " -x #{@index_name}"
45
+ bowtiecmd += " -1 #{left}"
27
46
  # paired end?
28
47
  bowtiecmd += " -2 #{right}" if right
29
- bowtiecmd += " > #{outputname}"
48
+ bowtiecmd += " -S #{@sam}"
30
49
  # run bowtie
31
- `#{bowtiecmd}`
50
+ runner = Cmd.new bowtiecmd
51
+ runner.run
32
52
  end
33
- outputname
53
+ @sam
34
54
  end
35
55
 
36
56
  def build_index file
37
- unless File.exists?(file + '.1.bt2')
38
- `#{@bowtie2_build} --offrate 1 #{file} #{File.basename(file)}`
57
+ @index_name = File.basename(file).split(".")[0..-2].join(".")
58
+ unless File.exists?(@index_name + '.1.bt2')
59
+ cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
60
+ runner = Cmd.new cmd
61
+ runner.run
39
62
  end
63
+ @index_built = true
40
64
  end
41
65
 
42
66
  end # Bowtie2
@@ -0,0 +1,19 @@
1
+ require 'open3'
2
+
3
+ module Transrate
4
+
5
# Holds a shell command line together with the output and exit status
# captured the last time it was run.
class Cmd

  attr_accessor :cmd, :stdout, :stderr, :status

  # cmd - the command line to execute, as a single String
  def initialize(cmd)
    @cmd = cmd
  end

  # Executes the command with Open3.capture3 and stores the three results.
  #
  # Returns the [stdout, stderr, status] array produced by capture3.
  def run
    captured = Open3.capture3(@cmd)
    @stdout = captured[0]
    @stderr = captured[1]
    @status = captured[2]
    captured
  end

end
18
+
19
+ end
@@ -1,7 +1,8 @@
1
1
  require 'set'
2
+ require 'crb-blast'
2
3
 
3
4
  module Transrate
4
-
5
+
5
6
  class ComparativeMetrics
6
7
 
7
8
  attr_reader :rbh_per_contig
@@ -9,21 +10,26 @@ module Transrate
9
10
  attr_reader :reciprocal_hits
10
11
  attr_reader :has_run
11
12
  attr_reader :reference_coverage
13
+ attr_reader :n_chimeras, :p_chimeras
12
14
 
13
- def initialize assembly, reference
15
# assembly  - the assembly being assessed
# reference - the reference it is compared against
# threads   - thread count handed to the reciprocal BLAST run
def initialize(assembly, reference, threads)
  @assembly, @reference, @threads = assembly, reference, threads
end
18
20
 
19
21
  def run
20
- rbu = reciprocal_best_ublast
21
- @ortholog_hit_ratio = ortholog_hit_ratio rbu
22
- @collapse_factor = collapse_factor @ra.r2l_hits
23
- @reciprocal_hits = rbu.size
22
+ @crbblast = reciprocal_best_blast
23
+ @ortholog_hit_ratio = ortholog_hit_ratio @crbblast
24
+ @collapse_factor = collapse_factor @crbblast.target_results
25
+ @reciprocal_hits = @crbblast.size
24
26
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
25
27
  @reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
26
28
  @rbh_per_contig = @reciprocal_hits.to_f / @assembly.assembly.size.to_f
29
+ @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
30
+ @n_contigs_with_recip = @crbblast.reciprocals.size
31
+ @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
32
+ chimeras @crbblast
27
33
  @has_run = true
28
34
  end
29
35
 
@@ -31,33 +37,247 @@ module Transrate
31
37
  {
32
38
  :reciprocal_hits => @reciprocal_hits,
33
39
  :rbh_per_contig => @rbh_per_contig,
40
+ :p_contigs_with_recip => @p_contigs_with_recip,
41
+ :n_contigs_with_recip => @n_contigs_with_recip,
42
+ :p_refs_with_recip => @p_refs_with_recip,
43
+ :n_refs_with_recip => @n_refs_with_recip,
34
44
  :rbh_per_reference => @rbh_per_reference,
35
45
  :reference_coverage => @reference_coverage,
36
46
  :ortholog_hit_ratio => @ortholog_hit_ratio,
37
- :collapse_factor => @collapse_factor
47
+ :collapse_factor => @collapse_factor,
48
+ :n_chimeras => @n_chimeras,
49
+ :p_chimeras => @p_chimeras,
50
+ :cov25 => @cov[0],
51
+ :cov50 => @cov[1],
52
+ :cov75 => @cov[2],
53
+ :cov85 => @cov[3],
54
+ :cov95 => @cov[4],
55
+ :p_cov25 => @cov[0]/@reference.size.to_f,
56
+ :p_cov50 => @cov[1]/@reference.size.to_f,
57
+ :p_cov75 => @cov[2]/@reference.size.to_f,
58
+ :p_cov85 => @cov[3]/@reference.size.to_f,
59
+ :p_cov95 => @cov[4]/@reference.size.to_f
38
60
  }
39
61
  end
40
62
 
41
- def reciprocal_best_ublast
42
- @ra = ReciprocalAnnotation.new @assembly, @reference
43
- @ra.run
63
# Runs CRB_Blast on the assembly and reference files with an e-value
# argument of 1e-5 and the configured thread count.
#
# Returns the CRB_Blast object after it has run.
def reciprocal_best_blast
  CRB_Blast.new(@assembly.file, @reference.file).tap do |blast|
    blast.run(1e-5, @threads)
  end
end
45
68
 
46
- def ortholog_hit_ratio rbu=nil
69
+ # coverage of contigs that have reciprocal hits
70
+ # divided by
71
+ # number of reciprocal targets
72
# Proportion of reference sequence covered by reciprocal hits: the summed
# covered length of every reference that has at least one reciprocal hit,
# divided by the summed length of those references.
#
# crbblast - object responding to #reciprocals (a Hash of query => list of
#            hits, each hit responding to #target, #tstart, #tend, #tlen)
#            and #target_is_prot
#
# Side effects:
#   @n_refs_with_recip - number of references with a reciprocal hit
#   @cov               - per-threshold count of references whose covered
#                        proportion reaches 0.25, 0.5, 0.75, 0.85 and 0.95;
#                        always initialised, even with no hits at all
#
# Returns a Float (NaN when there are no reciprocal hits), or the cached
# @ortholog_hit_ratio if that has already been set.
def ortholog_hit_ratio(crbblast)
  return @ortholog_hit_ratio unless @ortholog_hit_ratio.nil?

  # group every reciprocal hit by the reference it lands on
  targets = {}
  crbblast.reciprocals.each_value do |hits|
    hits.each { |hit| (targets[hit.target] ||= []) << hit }
  end
  @n_refs_with_recip = targets.size

  thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
  @cov ||= Array.new(thresholds.size, 0)
  # protein coordinates and lengths are multiplied by 3 so that they are
  # on the same scale as nucleotide ones
  scale = crbblast.target_is_prot ? 3 : 1

  total_coverage = 0
  total_length = 0
  targets.each_value do |hits|
    target_length = hits.last.tlen * scale
    intervals = hits.map do |hit|
      [hit.tstart * scale, hit.tend * scale].minmax
    end
    covered = merged_block_length(intervals)
    proportion = covered / target_length.to_f
    thresholds.each_with_index do |threshold, i|
      @cov[i] += 1 if proportion >= threshold
    end
    total_coverage += covered
    total_length += target_length
  end
  total_coverage / total_length.to_f
end

# Number of positions covered by the union of inclusive [start, stop]
# intervals. Intervals that overlap or share an endpoint are merged, so no
# position is counted more than once.
def merged_block_length(intervals)
  total = 0
  current_start = current_stop = nil
  intervals.sort.each do |start, stop|
    if current_stop && start <= current_stop
      current_stop = stop if stop > current_stop
    else
      total += current_stop - current_start + 1 if current_stop
      current_start, current_stop = start, stop
    end
  end
  total += current_stop - current_start + 1 if current_stop
  total
end
private :merged_block_length
183
+
184
# Counts the queries whose reciprocal hits look chimeric and stores the
# count in @n_chimeras and its share of all queries in @p_chimeras.
#
# Every unordered pair of hits for a query is examined. A pair is counted
# when either
#   * both hits are on the same target and their target ranges have an
#     overlap_amount above 0.75, or
#   * the hits are on different targets and their query ranges have an
#     overlap_amount below 0.25.
# The query is counted as a chimera when the number of counted pairs
# divided by its number of hits is at least 0.5.
#
# Returns @p_chimeras.
def chimeras(crbblast)
  @n_chimeras = 0
  crbblast.reciprocals.each_pair do |_query, hits|
    flagged = hits.to_a.combination(2).count do |first, second|
      if first.target == second.target
        first_range = [first.tstart, first.tend].minmax
        second_range = [second.tstart, second.tend].minmax
        overlap_amount(*first_range, *second_range) > 0.75
      else
        first_range = [first.qstart, first.qend].minmax
        second_range = [second.qstart, second.qend].minmax
        overlap_amount(*first_range, *second_range) < 0.25
      end
    end
    @n_chimeras += 1 if flagged / hits.size.to_f >= 0.5
  end
  @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
end
217
+
218
# Classifies how interval a (astart..astop) relates to interval b
# (bstart..bstop).
#
# Returns
#   0 - a and b are identical
#   1 - a starts at or after b's start and b ends inside a
#   2 - a starts before b and ends inside b
#   3 - a starts at or after b's start and b ends after a
#   4 - a starts before b and ends after b
#   5 - a starts before b and does not reach past b's start
#   6 - a starts at or after b's start and b does not reach past a's start
# Comparisons are strict, so intervals that only share an endpoint are
# reported as not overlapping (5 or 6).
def overlap(astart, astop, bstart, bstop)
  return 0 if astart == bstart && astop == bstop

  if astart < bstart
    return 5 unless astop > bstart
    astop > bstop ? 4 : 2
  else
    return 6 unless bstop > astart
    bstop > astop ? 3 : 1
  end
end
243
+
244
# Size of the overlap between interval a (astart..astop) and interval b
# (bstart..bstop) relative to the span they cover together.
#
# Returns 1 for identical intervals, 0 when the strict comparisons find no
# overlap (including intervals that only share an endpoint), and otherwise
# a Float: the shorter length over the longer one when one interval
# contains the other, or the shared length over the combined span when
# they partially overlap.
def overlap_amount(astart, astop, bstart, bstop)
  return 1 if astart == bstart && astop == bstop

  if astart < bstart
    return 0 unless astop > bstart
    if astop > bstop
      shared, span = bstop - bstart + 1, astop - astart + 1
    else
      shared, span = astop - bstart + 1, bstop - astart + 1
    end
  else
    return 0 unless bstop > astart
    if bstop > astop
      shared, span = astop - astart + 1, bstop - bstart + 1
    else
      shared, span = bstop - astart + 1, astop - bstart + 1
    end
  end
  shared / span.to_f
end
51
269
 
52
270
  def collapse_factor hits=nil
53
271
  return @collapse_factor unless @collapse_factor.nil?
54
272
  targets = {}
55
- hits.each_pair do |query, hit|
56
- target = hit.target
57
- unless targets.has_key? target
58
- targets[target] = Set.new
273
+ hits.each_pair do |query, list|
274
+ list.each do |hit|
275
+ target = hit.target
276
+ unless targets.has_key? target
277
+ targets[target] = Set.new
278
+ end
279
+ targets[target] << query
59
280
  end
60
- targets[target] << query
61
281
  end
62
282
  sum = targets.values.reduce(0.0){ |summer, val| summer += val.size }
63
283
  sum / targets.size
@@ -0,0 +1,212 @@
1
+ require 'forwardable'
2
+ require 'inline'
3
+
4
+ module Transrate
5
+
6
+ # A contig in a transcriptome assembly.
7
# A contig in a transcriptome assembly.
#
# Wraps a sequence object that responds to #seq (the bases as a String) and
# #length, and derives composition statistics from it. #size and #length are
# delegated to the wrapped sequence object.
class Contig

  include Enumerable
  extend Forwardable
  def_delegators :@seq, :size, :length
  attr_accessor :seq, :name, :coverage

  # seq  - the sequence object to wrap
  # name - name to use when +seq+ does not respond to #entry_id; when it
  #        does, the entry id takes precedence
  def initialize(seq, name: nil)
    @seq = seq
    @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
  end

  # Yields each base of the contig as a one-character String.
  def each(&block)
    @seq.seq.each_char(&block)
  end

  # Base composition of the contig.
  #
  # Returns a Hash of lower-case Symbol => count that always contains the
  # keys :a, :t, :c, :g and :n. Any other symbol found in the sequence is
  # tallied under its own key instead of raising. The dibase composition is
  # computed in the same pass and both results are cached.
  def base_composition
    return @base_composition if @base_composition

    base_comp = { :a => 0, :t => 0, :c => 0, :g => 0, :n => 0 }
    dibase_comp = { :cg => 0 }
    last_base = nil
    @seq.seq.each_char do |base|
      # single bases
      key = base.downcase.to_sym
      base_comp[key] = base_comp.fetch(key, 0) + 1
      if last_base
        # pairs of adjacent bases
        dikey = "#{last_base}#{base}".downcase.to_sym
        dibase_comp[dikey] = dibase_comp.fetch(dikey, 0) + 1
      end
      last_base = base
    end
    @base_composition = base_comp
    @dibase_composition = dibase_comp
    base_comp
  end

  # Dibase composition of the contig: a Hash of lower-case two-letter
  # Symbol => count of adjacent base pairs. Always contains :cg.
  def dibase_composition
    base_composition unless @dibase_composition
    @dibase_composition
  end

  # Number of bases that are C
  def bases_c
    base_composition[:c]
  end

  # Proportion of bases that are C
  def prop_c
    bases_c / length.to_f
  end

  # Number of bases that are G
  def bases_g
    base_composition[:g]
  end

  # Proportion of bases that are G
  def prop_g
    bases_g / length.to_f
  end

  # Number of bases that are A
  def bases_a
    base_composition[:a]
  end

  # Proportion of bases that are A
  def prop_a
    bases_a / length.to_f
  end

  # Number of bases that are T
  def bases_t
    base_composition[:t]
  end

  # Proportion of bases that are T
  def prop_t
    bases_t / length.to_f
  end

  # Number of bases that are N
  def bases_n
    base_composition[:n]
  end

  # Proportion of bases that are N
  def prop_n
    bases_n / length.to_f
  end

  # Number of bases that are G or C
  def bases_gc
    bases_g + bases_c
  end

  # Proportion of bases that are G or C
  def prop_gc
    prop_g + prop_c
  end

  # GC skew, computed here as the G+C share of the A, C, G and T bases
  # (N and other symbols are left out of the denominator).
  def gc_skew
    prop_gc / (prop_a + prop_t + prop_gc)
  end

  # AT skew, the counterpart of gc_skew: the A+T share of the A, C, G and
  # T bases. The numerator is parenthesised so that the whole A+T
  # proportion is divided, not just the T proportion.
  def at_skew
    (prop_a + prop_t) / (prop_a + prop_t + prop_gc)
  end

  # CpG count: number of positions where C is immediately followed by G
  def cpg_count
    dibase_composition[:cg]
  end

  # CpG (C-phosphate-G) observed/expected ratio:
  #   (number of CpG * sequence length) / (number of C * number of G)
  # The result is not finite when the contig has no C or no G.
  def cpg_ratio
    cpg_count * length / (bases_c * bases_g).to_f
  end

  # Length of the longest open reading frame in the contig, as counted by
  # longest_orf: the longest run of consecutive in-frame positions without
  # a stop codon, over three forward and three reverse frames. Only
  # upper-case bases are recognised as stop codons.
  def orf_length
    longest_orf(@seq.seq)
  end

  # Inlined C longest-ORF function.
  #
  # Forward-strand stops are TAG, TAA and TGA; reverse-strand stops are
  # matched as CTA, TTA and TCA. Each frame keeps a running counter that is
  # reset at a stop codon. Once the scan is finished all six counters are
  # compared with the longest run, so a frame that is still open at the end
  # of the sequence is taken into account whichever frame it is.
  inline do |builder|
    builder.c <<-SRC
      static
      void
      longest_orf(VALUE _s) {
        int i,sl,longest=0;
        int len[6];
        char * c_str;

        sl = RSTRING_LEN(_s);
        c_str = StringValueCStr(_s);
        for (i=0;i<6;i++) {
          len[i]=0;
        }
        for (i=0;i<sl-2;i++) {
          if (c_str[i]=='T' &&
            ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
            (c_str[i+1]=='A' && c_str[i+2]=='A') ||
            (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
            if (len[i%3] > longest) {
              longest = len[i%3];
            }
            len[i%3]=0;
          } else {
            len[i%3]++;
          }
          if (c_str[i+2]=='A' &&
            ((c_str[i]=='C' && c_str[i+1]=='T') ||
            (c_str[i]=='T' && c_str[i+1]=='T') ||
            (c_str[i]=='T' && c_str[i+1]=='C'))) {
            if (len[3+i%3] > longest) {
              longest = len[3+i%3];
            }
            len[3+i%3]=0;
          } else {
            len[3+i%3]++;
          }
        }
        for (i=0;i<6;i++) {
          if (len[i] > longest) {
            longest = len[i];
          }
        }
        return INT2NUM(longest);
      }
    SRC
  end

  # Linguistic complexity for word size +k+: the number of distinct k-mers
  # present in the contig (case-insensitive) divided by 4**k.
  #
  # Returns a Float; 0.0 when the contig is shorter than k.
  def linguistic_complexity(k)
    possible = 4 ** k
    bases = @seq.seq
    # a Hash used as a set, so no extra library has to be loaded
    seen = {}
    (0..@seq.length - k).each do |i|
      seen[bases.slice(i, k).upcase] = true # slice(start, length)
    end
    seen.size / possible.to_f
  end

end
211
+
212
+ end