transrate 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/transrate.rb CHANGED
@@ -15,6 +15,7 @@ require 'transrate/metric'
15
15
  require 'transrate/dimension_reduce'
16
16
  require 'transrate/samtools'
17
17
  require 'transrate/cmd'
18
+ require 'transrate/transrate.so'
18
19
 
19
20
  # Transrate is a comprehensive transcriptome assembly
20
21
  # quality assessment tool.
@@ -26,7 +26,7 @@ module Transrate
26
26
 
27
27
  include Enumerable
28
28
  extend Forwardable
29
- def_delegators :@assembly, :each, :<<, :size, :length
29
+ def_delegators :@assembly, :each, :each_value, :<<, :size, :length, :[]
30
30
 
31
31
  attr_accessor :file
32
32
  attr_reader :assembly
@@ -43,11 +43,12 @@ module Transrate
43
43
  unless File.exist? @file
44
44
  raise IOError.new "Assembly file doesn't exist: #{@file}"
45
45
  end
46
- @assembly = []
46
+ @assembly = {}
47
47
  @n_bases = 0
48
48
  Bio::FastaFormat.open(file).each do |entry|
49
49
  @n_bases += entry.length
50
- @assembly << Contig.new(entry)
50
+ contig = Contig.new(entry)
51
+ @assembly[contig.name] = contig
51
52
  end
52
53
  @contig_metrics = ContigMetrics.new self
53
54
  end
@@ -78,7 +79,7 @@ module Transrate
78
79
  # @return [Hash] basic statistics about the assembly
79
80
  def basic_stats threads=1
80
81
  return @basic_stats if @basic_stats
81
- bin = @assembly.dup
82
+ bin = @assembly.values
82
83
  @basic_stats = basic_bin_stats bin
83
84
  @basic_stats
84
85
  end # basic_stats
@@ -103,7 +104,7 @@ module Transrate
103
104
  # representing contigs in the assembly
104
105
 
105
106
  def basic_bin_stats bin
106
-
107
+
107
108
  # cumulative length is a float so we can divide it
108
109
  # accurately later to get the mean length
109
110
  cumulative_length = 0.0
@@ -194,7 +195,7 @@ module Transrate
194
195
  covfile = Samtools.coverage bam
195
196
  # get an assembly enumerator
196
197
  assembly_enum = @assembly.to_enum
197
- contig = assembly_enum.next
198
+ contig_name, contig = assembly_enum.next
198
199
  # precreate an array of the correct size to contain
199
200
  # coverage. this is necessary because samtools mpileup
200
201
  # doesn't print a result line for bases with 0 coverage
@@ -209,12 +210,13 @@ module Transrate
209
210
  break
210
211
  end
211
212
  # extract the columns
212
- name, pos, cov = cols[name_i], cols[pos_i].to_i, cols[cov_i].to_i
213
- unless contig.name == name
214
- while contig.name != name
213
+ name = Bio::FastaDefline.new(cols[name_i]).entry_id
214
+ pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
215
+ unless contig_name == name
216
+ while contig_name != name
215
217
  begin
216
218
  block.call(contig, contig.coverage)
217
- contig = assembly_enum.next
219
+ contig_name, contig = assembly_enum.next
218
220
  contig.coverage = Array.new(contig.length, 0)
219
221
  rescue StopIteration => stop_error
220
222
  logger.error 'reached the end of assembly enumerator while ' +
@@ -49,6 +49,9 @@ module Transrate
49
49
  # run bowtie
50
50
  runner = Cmd.new bowtiecmd
51
51
  runner.run
52
+ if !runner.status.success?
53
+ raise Bowtie2Error.new("Bowtie2 failed\n#{runner.stderr}")
54
+ end
52
55
  end
53
56
  @sam
54
57
  end
@@ -59,6 +62,10 @@ module Transrate
59
62
  cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
60
63
  runner = Cmd.new cmd
61
64
  runner.run
65
+ if !runner.status.success?
66
+ msg = "Failed to build Bowtie2 index\n#{runner.stderr}"
67
+ raise Bowtie2Error.new(msg)
68
+ end
62
69
  end
63
70
  @index_built = true
64
71
  end
@@ -10,86 +10,89 @@ module Transrate
10
10
  attr_reader :reciprocal_hits
11
11
  attr_reader :has_run
12
12
  attr_reader :reference_coverage
13
+ attr_reader :comp_stats
13
14
  attr_reader :n_chimeras, :p_chimeras
14
15
 
15
16
  def initialize assembly, reference, threads
16
17
  @assembly = assembly
17
18
  @reference = reference
18
19
  @threads = threads
20
+ @comp_stats = Hash.new
19
21
  end
20
22
 
21
23
  def run
22
24
  @crbblast = reciprocal_best_blast
23
- @ortholog_hit_ratio = ortholog_hit_ratio @crbblast
24
- @collapse_factor = collapse_factor @crbblast.target_results
25
+ @reference_coverage = coverage @crbblast
26
+ @collapse_factor = collapse_factor @crbblast.reciprocals
25
27
  @reciprocal_hits = @crbblast.size
26
28
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
27
- @reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
28
- @rbh_per_contig = @reciprocal_hits.to_f / @assembly.assembly.size.to_f
29
29
  @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
30
30
  @n_contigs_with_recip = @crbblast.reciprocals.size
31
+ count_ref_crbbs
31
32
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
32
33
  chimeras @crbblast
34
+ self.run_comp_stats
33
35
  @has_run = true
34
36
  end
35
37
 
36
- def comp_stats
37
- {
38
- :reciprocal_hits => @reciprocal_hits,
39
- :rbh_per_contig => @rbh_per_contig,
40
- :p_contigs_with_recip => @p_contigs_with_recip,
41
- :n_contigs_with_recip => @n_contigs_with_recip,
42
- :p_refs_with_recip => @p_refs_with_recip,
43
- :n_refs_with_recip => @n_refs_with_recip,
44
- :rbh_per_reference => @rbh_per_reference,
45
- :reference_coverage => @reference_coverage,
46
- :ortholog_hit_ratio => @ortholog_hit_ratio,
47
- :collapse_factor => @collapse_factor,
48
- :n_chimeras => @n_chimeras,
49
- :p_chimeras => @p_chimeras,
50
- :cov25 => @cov[0],
51
- :cov50 => @cov[1],
52
- :cov75 => @cov[2],
53
- :cov85 => @cov[3],
54
- :cov95 => @cov[4],
55
- :p_cov25 => @cov[0]/@reference.size.to_f,
56
- :p_cov50 => @cov[1]/@reference.size.to_f,
57
- :p_cov75 => @cov[2]/@reference.size.to_f,
58
- :p_cov85 => @cov[3]/@reference.size.to_f,
59
- :p_cov95 => @cov[4]/@reference.size.to_f
60
- }
38
+ def run_comp_stats
39
+ @comp_stats[:CRBB_hits] = @reciprocal_hits # CRBB hits
40
+ @comp_stats[:p_contigs_with_CRBB] = @p_contigs_with_recip
41
+ @comp_stats[:n_contigs_with_CRBB] = @n_contigs_with_recip
42
+ @comp_stats[:p_refs_with_CRBB] = @p_refs_with_recip
43
+ @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
44
+ @comp_stats[:rbh_per_reference] = @rbh_per_reference
45
+ @comp_stats[:reference_coverage] = @reference_coverage
46
+ @comp_stats[:collapse_factor] = @collapse_factor
47
+ @comp_stats[:n_chimeras] = @n_chimeras
48
+ @comp_stats[:p_chimeras] = @p_chimeras
61
49
  end
62
50
 
63
51
  def reciprocal_best_blast
64
- crbblast = CRB_Blast.new @assembly.file, @reference.file
65
- crbblast.run 1e-5, @threads
52
+ crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
53
+ crbblast.run(1e-5, @threads, true)
66
54
  crbblast
67
55
  end
68
56
 
69
57
  # coverage of contigs that have reciprocal hits
70
- # divided by
71
- # number of reciprocal targets
72
- def ortholog_hit_ratio crbblast
73
- return @ortholog_hit_ratio unless @ortholog_hit_ratio.nil?
58
+ # divided by number of reciprocal targets
59
+ def coverage crbblast
60
+ return @reference_coverage unless @reference_coverage.nil?
61
+ crbblast.reciprocals.each do |key, list|
62
+ list.each_with_index do |hit, i|
63
+ unless @reference.assembly.key? hit.target
64
+ raise "#{hit.target} not in reference"
65
+ end
66
+ @reference[hit.target].hits << hit
74
67
 
75
- targets = Hash.new
76
- crbblast.reciprocals.each_pair do |key, list|
77
- list.each do |hit|
78
- targets[hit.target] ||= [] # if key doesn't exist add it with a []
79
- targets[hit.target] << hit
68
+ unless @assembly.assembly.key? hit.query
69
+ raise "#{hit.query} not in assembly"
70
+ end
71
+ contig = @assembly[hit.query]
72
+ contig.has_crb = true
73
+ # how much of the reference is covered by this single contig
74
+ contig.reference_coverage = hit.alnlen / hit.tlen
75
+ contig.hits << hit
80
76
  end
81
77
  end
82
- @n_refs_with_recip = targets.size
83
- total_coverage=0
84
- total_length=0
85
- targets.each_pair do |key, list|
78
+ total_coverage = 0
79
+ total_length = 0
80
+ cov = [0.25, 0.5, 0.75, 0.85, 0.95]
81
+ @reference.each_value do |ref_contig|
82
+ key = ref_contig.name
83
+ list = ref_contig.hits
84
+ total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
85
+
86
+ next if list.empty? # ah this is what was breaking everything
86
87
  blocks = []
87
88
  target_length = 0
88
89
  list.each do |hit|
89
90
  target_length = hit.tlen
90
91
  if crbblast.target_is_prot
91
92
  target_length *= 3
92
- start, stop = [hit.tstart*3, hit.tend*3].minmax
93
+ start, stop = [hit.tstart, hit.tend].minmax
94
+ start = start*3-2
95
+ stop = stop*3
93
96
  else
94
97
  start, stop = [hit.tstart, hit.tend].minmax
95
98
  end
@@ -112,7 +115,8 @@ module Transrate
112
115
  block[0] = start
113
116
  block[1] = stop
114
117
  found=true
115
- # elsif o == 4 # full overlap
118
+ elsif o == 4 # full overlap
119
+ found=true
116
120
  # nothing
117
121
  # elsif o == 5 || o == 6 # no overlap
118
122
 
@@ -157,28 +161,53 @@ module Transrate
157
161
  end # each_with_index b
158
162
  end # each_with_index a
159
163
  # sum blocks to find total coverage
160
- length_of_coverage=0
161
- blocks.each do |block|
162
- if block[0] and block[1]
163
- if block[0]>=0 and block[1]>=0
164
- length_of_coverage += block[1] - block[0] + 1
165
- end
166
- else
167
- puts "error: key = #{key}, #{blocks}"
168
- end
169
- end
170
- cov = [0.25, 0.5, 0.75, 0.85, 0.95]
164
+ length_of_coverage = calculate_coverage blocks
171
165
  @cov ||= [0, 0, 0, 0, 0]
172
- p = length_of_coverage / target_length.to_f
166
+ if target_length > 0
167
+ # puts "#{length_of_coverage} / #{target_length.to_f}"
168
+ ref_p = length_of_coverage / target_length.to_f
169
+ else
170
+ ref_p = 0
171
+ end
172
+ ref_contig.reference_coverage = ref_p
173
+
173
174
  cov.each_with_index do |c, i|
174
- if p >= c
175
+ if ref_p >= c
175
176
  @cov[i] +=1
176
177
  end
177
178
  end
179
+
178
180
  total_coverage += length_of_coverage
179
- total_length += target_length
180
181
  end
181
- return ortholog_hit_ratio = total_coverage / total_length.to_f
182
+ cov.each_with_index do |p, i|
183
+ @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
184
+ @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
185
+ @cov[i]/@reference.size.to_f
186
+ end
187
+ total_coverage / total_length.to_f
188
+ end
189
+
190
+ # Calculate the total coverage from a set of coverage blocks
191
+ def calculate_coverage blocks
192
+ coverage = 0
193
+ blocks.each do |block|
194
+ if block[0] and block[1]
195
+ if block[0]>=0 and block[1]>=0
196
+ coverage += block[1] - block[0] + 1
197
+ end
198
+ else
199
+ puts "error: key = #{key}, #{blocks}"
200
+ end
201
+ end
202
+ coverage
203
+ end
204
+
205
+ # Count reference proteins with at least one reciprocal hit
206
+ def count_ref_crbbs
207
+ @n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
208
+ name, contig = entry
209
+ sum + (contig.hits.length > 0 ? 1 : 0)
210
+ end
182
211
  end
183
212
 
184
213
  def chimeras crbblast
@@ -210,6 +239,10 @@ module Transrate
210
239
  end
211
240
  if p/list.size.to_f >= 0.5
212
241
  @n_chimeras += 1
242
+ unless @assembly.assembly.key? key
243
+ puts "key not in assembly: #{key}"
244
+ end
245
+ @assembly[key].is_chimera = true
213
246
  end
214
247
  end
215
248
  @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
@@ -267,20 +300,17 @@ module Transrate
267
300
  end
268
301
  end
269
302
 
270
- def collapse_factor hits=nil
303
+ # Count unique reference proteins per contig
304
+ def collapse_factor reciprocals
271
305
  return @collapse_factor unless @collapse_factor.nil?
272
- targets = {}
273
- hits.each_pair do |query, list|
274
- list.each do |hit|
275
- target = hit.target
276
- unless targets.has_key? target
277
- targets[target] = Set.new
278
- end
279
- targets[target] << query
280
- end
306
+ cf_sum = 0
307
+ reciprocals.each do |query, hits|
308
+ uniq_hits = Set.new hits.map{ |h| h.target }
309
+ cf = uniq_hits.length
310
+ @assembly[query].collapse_factor = cf
311
+ cf_sum += cf
281
312
  end
282
- sum = targets.values.reduce(0.0){ |summer, val| summer += val.size }
283
- sum / targets.size
313
+ cf_sum / reciprocals.size
284
314
  end
285
315
 
286
316
  end # ComparativeMetrics
@@ -1,5 +1,4 @@
1
1
  require 'forwardable'
2
- require 'inline'
3
2
 
4
3
  module Transrate
5
4
 
@@ -9,51 +8,105 @@ module Transrate
9
8
  include Enumerable
10
9
  extend Forwardable
11
10
  def_delegators :@seq, :size, :length
12
- attr_accessor :seq, :name, :coverage
11
+ attr_accessor :seq, :name
12
+ # read-based metrics
13
+ attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
14
+ # reference-based metrics
15
+ attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
16
+ attr_accessor :hits
13
17
 
14
18
  def initialize(seq, name: nil)
19
+ seq.seq.gsub!("\0", "") # there is probably a better fix than this
15
20
  @seq = seq
21
+ @seq.data = nil # no need to store raw fasta string
16
22
  @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
23
+ @hits = []
24
+ @reference_coverage = 0
25
+ @collapse_factor = 0
26
+ @is_chimera = false
27
+ @has_crb = false
28
+ @in_bridges = 0
29
+ @mean_coverage = 0
17
30
  end
18
31
 
19
32
  def each &block
20
33
  @seq.seq.each_char &block
21
34
  end
22
35
 
36
+ # Get all metrics available for this contig
37
+ def basic_metrics
38
+ basic = {
39
+ :length => length,
40
+ :prop_gc => prop_gc,
41
+ :gc_skew => gc_skew,
42
+ :at_skew => at_skew,
43
+ :cpg_count => cpg_count,
44
+ :cpg_ratio => cpg_ratio,
45
+ :orf_length => orf_length,
46
+ :linguistic_complexity_6 => linguistic_complexity(6)
47
+ }
48
+ end
49
+
50
+ def read_metrics
51
+ read = @coverage ? {
52
+ :uncovered_bases => uncovered_bases,
53
+ :mean_coverage => mean_coverage,
54
+ :in_bridges => in_bridges
55
+ } : {
56
+ :uncovered_bases => "NA",
57
+ :mean_coverage => "NA",
58
+ :in_bridges => in_bridges
59
+ }
60
+ end
61
+
62
+ def comparative_metrics
63
+ reference = @has_crb ? {
64
+ :has_crb => has_crb,
65
+ :collapse_factor => collapse_factor,
66
+ :reference_coverage => reference_coverage,
67
+ :is_chimera => is_chimera,
68
+ :hits => hits.map{ |h| h.target }.join(";")
69
+ } : {
70
+ :has_crb => false,
71
+ :collapse_factor => "NA",
72
+ :reference_coverage => "NA",
73
+ :is_chimera => "NA",
74
+ :hits => "NA"
75
+ }
76
+ end
77
+
23
78
  # Base composition of the contig
79
+ #
80
+ # On the first call (when the instance variable @base_composition is nil),
81
+ # call the C method to count the bases and dibases in the sequence,
82
+ # then read the counts out of the C array and store them in the hash.
83
+ # On later calls, just return the stored hash.
24
84
  def base_composition
25
85
  if @base_composition
26
86
  return @base_composition
27
87
  end
28
- base_comp = {
29
- :a => 0,
30
- :t => 0,
31
- :c => 0,
32
- :g => 0,
33
- :n => 0
34
- }
35
- dibase_comp = {
36
- :cg => 0
37
- }
38
- last_base = nil
39
- @seq.seq.each_char do |base|
40
- # single bases
41
- key = base.downcase.to_sym
42
- base_comp[key] += 1
43
- if last_base
44
- # pairs of bases
45
- dikey = "#{last_base}#{base}".downcase.to_sym
46
- if dibase_comp[dikey]
47
- dibase_comp[dikey] += 1
48
- else
49
- dibase_comp[dikey] = 1
50
- end
88
+ # else run the C method
89
+ composition(@seq.seq)
90
+ alphabet = ['a', 'c', 'g', 't', 'n']
91
+ @base_composition = {}
92
+ @dibase_composition={}
93
+ bases = []
94
+ dibases = []
95
+ alphabet.each do |c|
96
+ bases << "#{c}".to_sym
97
+ end
98
+ alphabet.each do |c|
99
+ alphabet.each do |d|
100
+ dibases << "#{c}#{d}".to_sym
51
101
  end
52
- last_base = base
53
102
  end
54
- @base_composition = base_comp
55
- @dibase_composition = dibase_comp
56
- return base_comp
103
+ bases.each_with_index do |a,i|
104
+ @base_composition[a] = base_count(i)
105
+ end
106
+ dibases.each_with_index do |a,i|
107
+ @dibase_composition[a] = dibase_count(i)
108
+ end
109
+ return @base_composition
57
110
  end
58
111
 
59
112
  # Dibase composition of the contig
@@ -124,89 +177,37 @@ module Transrate
124
177
 
125
178
  # GC skew
126
179
  def gc_skew
127
- prop_gc / (prop_a + prop_t + prop_gc)
180
+ (bases_g - bases_c) / (bases_g + bases_c).to_f
128
181
  end
129
182
 
130
183
  # AT skew
131
184
  def at_skew
132
- prop_a + prop_t / (prop_a + prop_t + prop_gc)
185
+ (bases_a - bases_t) / (bases_a + bases_t).to_f
133
186
  end
134
187
 
135
188
  # CpG count
136
189
  def cpg_count
137
- dibase_composition[:cg]
190
+ dibase_composition[:cg] + dibase_composition[:gc]
138
191
  end
139
192
 
140
- # CpG (C-phosphate-G) ratio
193
+ # observed-to-expected CpG (C-phosphate-G) ratio
141
194
  def cpg_ratio
142
- dibase_composition[:cg] / (prop_c * prop_g)
195
+ r = dibase_composition[:cg] + dibase_composition[:gc]
196
+ r /= (bases_c * bases_g).to_f
197
+ r *= (length - bases_n)
198
+ return r
143
199
  end
144
200
 
145
201
  # Find the longest orf in the contig
146
202
  def orf_length
147
- longest = longest_orf @seq.seq
148
- return longest
149
- end
150
-
151
- # Inlined C longest-ORF function
152
- inline do |builder|
153
- builder.c <<SRC
154
- static
155
- void
156
- longest_orf(VALUE _s) {
157
- int i,sl,longest=0;
158
- int len[6];
159
- char * c_str;
160
-
161
- sl = RSTRING_LEN(_s);
162
- c_str = StringValueCStr(_s);
163
- for (i=0;i<6;i++) {
164
- len[i]=0;
165
- }
166
- for (i=0;i<sl-2;i++) {
167
- if (c_str[i]=='T' &&
168
- ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
169
- (c_str[i+1]=='A' && c_str[i+2]=='A') ||
170
- (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
171
- if (len[i%3] > longest) {
172
- longest = len[i%3];
173
- }
174
- len[i%3]=0;
175
- } else {
176
- len[i%3]++;
177
- }
178
- if (c_str[i+2]=='A' &&
179
- ((c_str[i]=='C' && c_str[i+1]=='T') ||
180
- (c_str[i]=='T' && c_str[i+1]=='T') ||
181
- (c_str[i]=='T' && c_str[i+1]=='C'))) {
182
- if (len[3+i%3] > longest) {
183
- longest = len[3+i%3];
184
- }
185
- len[3+i%3]=0;
186
- } else {
187
- len[3+i%3]++;
188
- }
189
- }
190
- if (len[i%3] > longest) {
191
- longest = len[i%3];
192
- }
193
- if (len[3+i%3] > longest) {
194
- longest = len[3+i%3];
195
- }
196
- return INT2NUM(longest);
197
- }
198
- SRC
203
+ return @orf_length if @orf_length
204
+ @orf_length = longest_orf(@seq.seq) # call to C
205
+ return @orf_length
199
206
  end
200
207
 
201
208
  def linguistic_complexity k
202
- d = 4 ** k
203
- set = Set.new
204
- (0..@seq.length-k).each do |i|
205
- set << @seq.seq.slice(i,k).upcase # slice(start, length)
206
- end # count how many kmers in seq
207
- set.size / d.to_f
209
+ return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
208
210
  end
209
-
210
211
  end
211
212
 
212
213
  end