transrate 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/transrate.rb CHANGED
@@ -15,6 +15,7 @@ require 'transrate/metric'
15
15
  require 'transrate/dimension_reduce'
16
16
  require 'transrate/samtools'
17
17
  require 'transrate/cmd'
18
+ require 'transrate/transrate.so'
18
19
 
19
20
  # Transrate is a comprehensive transcriptome assembly
20
21
  # quality assessment tool.
@@ -26,7 +26,7 @@ module Transrate
26
26
 
27
27
  include Enumerable
28
28
  extend Forwardable
29
- def_delegators :@assembly, :each, :<<, :size, :length
29
+ def_delegators :@assembly, :each, :each_value, :<<, :size, :length, :[]
30
30
 
31
31
  attr_accessor :file
32
32
  attr_reader :assembly
@@ -43,11 +43,12 @@ module Transrate
43
43
  unless File.exist? @file
44
44
  raise IOError.new "Assembly file doesn't exist: #{@file}"
45
45
  end
46
- @assembly = []
46
+ @assembly = {}
47
47
  @n_bases = 0
48
48
  Bio::FastaFormat.open(file).each do |entry|
49
49
  @n_bases += entry.length
50
- @assembly << Contig.new(entry)
50
+ contig = Contig.new(entry)
51
+ @assembly[contig.name] = contig
51
52
  end
52
53
  @contig_metrics = ContigMetrics.new self
53
54
  end
@@ -78,7 +79,7 @@ module Transrate
78
79
  # @return [Hash] basic statistics about the assembly
79
80
  def basic_stats threads=1
80
81
  return @basic_stats if @basic_stats
81
- bin = @assembly.dup
82
+ bin = @assembly.values
82
83
  @basic_stats = basic_bin_stats bin
83
84
  @basic_stats
84
85
  end # basic_stats
@@ -103,7 +104,7 @@ module Transrate
103
104
  # representing contigs in the assembly
104
105
 
105
106
  def basic_bin_stats bin
106
-
107
+
107
108
  # cumulative length is a float so we can divide it
108
109
  # accurately later to get the mean length
109
110
  cumulative_length = 0.0
@@ -194,7 +195,7 @@ module Transrate
194
195
  covfile = Samtools.coverage bam
195
196
  # get an assembly enumerator
196
197
  assembly_enum = @assembly.to_enum
197
- contig = assembly_enum.next
198
+ contig_name, contig = assembly_enum.next
198
199
  # precreate an array of the correct size to contain
199
200
  # coverage. this is necessary because samtools mpileup
200
201
  # doesn't print a result line for bases with 0 coverage
@@ -209,12 +210,13 @@ module Transrate
209
210
  break
210
211
  end
211
212
  # extract the columns
212
- name, pos, cov = cols[name_i], cols[pos_i].to_i, cols[cov_i].to_i
213
- unless contig.name == name
214
- while contig.name != name
213
+ name = Bio::FastaDefline.new(cols[name_i]).entry_id
214
+ pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
215
+ unless contig_name == name
216
+ while contig_name != name
215
217
  begin
216
218
  block.call(contig, contig.coverage)
217
- contig = assembly_enum.next
219
+ contig_name, contig = assembly_enum.next
218
220
  contig.coverage = Array.new(contig.length, 0)
219
221
  rescue StopIteration => stop_error
220
222
  logger.error 'reached the end of assembly enumerator while ' +
@@ -49,6 +49,9 @@ module Transrate
49
49
  # run bowtie
50
50
  runner = Cmd.new bowtiecmd
51
51
  runner.run
52
+ if !runner.status.success?
53
+ raise Bowtie2Error.new("Bowtie2 failed\n#{runner.stderr}")
54
+ end
52
55
  end
53
56
  @sam
54
57
  end
@@ -59,6 +62,10 @@ module Transrate
59
62
  cmd = "#{@bowtie2_build} --quiet --offrate 1 #{file} #{@index_name}"
60
63
  runner = Cmd.new cmd
61
64
  runner.run
65
+ if !runner.status.success?
66
+ msg = "Failed to build Bowtie2 index\n#{runner.stderr}"
67
+ raise Bowtie2Error.new(msg)
68
+ end
62
69
  end
63
70
  @index_built = true
64
71
  end
@@ -10,86 +10,89 @@ module Transrate
10
10
  attr_reader :reciprocal_hits
11
11
  attr_reader :has_run
12
12
  attr_reader :reference_coverage
13
+ attr_reader :comp_stats
13
14
  attr_reader :n_chimeras, :p_chimeras
14
15
 
15
16
  def initialize assembly, reference, threads
16
17
  @assembly = assembly
17
18
  @reference = reference
18
19
  @threads = threads
20
+ @comp_stats = Hash.new
19
21
  end
20
22
 
21
23
  def run
22
24
  @crbblast = reciprocal_best_blast
23
- @ortholog_hit_ratio = ortholog_hit_ratio @crbblast
24
- @collapse_factor = collapse_factor @crbblast.target_results
25
+ @reference_coverage = coverage @crbblast
26
+ @collapse_factor = collapse_factor @crbblast.reciprocals
25
27
  @reciprocal_hits = @crbblast.size
26
28
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
27
- @reference_coverage = @ortholog_hit_ratio * @rbh_per_reference
28
- @rbh_per_contig = @reciprocal_hits.to_f / @assembly.assembly.size.to_f
29
29
  @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
30
30
  @n_contigs_with_recip = @crbblast.reciprocals.size
31
+ count_ref_crbbs
31
32
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
32
33
  chimeras @crbblast
34
+ self.run_comp_stats
33
35
  @has_run = true
34
36
  end
35
37
 
36
- def comp_stats
37
- {
38
- :reciprocal_hits => @reciprocal_hits,
39
- :rbh_per_contig => @rbh_per_contig,
40
- :p_contigs_with_recip => @p_contigs_with_recip,
41
- :n_contigs_with_recip => @n_contigs_with_recip,
42
- :p_refs_with_recip => @p_refs_with_recip,
43
- :n_refs_with_recip => @n_refs_with_recip,
44
- :rbh_per_reference => @rbh_per_reference,
45
- :reference_coverage => @reference_coverage,
46
- :ortholog_hit_ratio => @ortholog_hit_ratio,
47
- :collapse_factor => @collapse_factor,
48
- :n_chimeras => @n_chimeras,
49
- :p_chimeras => @p_chimeras,
50
- :cov25 => @cov[0],
51
- :cov50 => @cov[1],
52
- :cov75 => @cov[2],
53
- :cov85 => @cov[3],
54
- :cov95 => @cov[4],
55
- :p_cov25 => @cov[0]/@reference.size.to_f,
56
- :p_cov50 => @cov[1]/@reference.size.to_f,
57
- :p_cov75 => @cov[2]/@reference.size.to_f,
58
- :p_cov85 => @cov[3]/@reference.size.to_f,
59
- :p_cov95 => @cov[4]/@reference.size.to_f
60
- }
38
+ def run_comp_stats
39
+ @comp_stats[:CRBB_hits] = @reciprocal_hits # CRBB hits
40
+ @comp_stats[:p_contigs_with_CRBB] = @p_contigs_with_recip
41
+ @comp_stats[:n_contigs_with_CRBB] = @n_contigs_with_recip
42
+ @comp_stats[:p_refs_with_CRBB] = @p_refs_with_recip
43
+ @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
44
+ @comp_stats[:rbh_per_reference] = @rbh_per_reference
45
+ @comp_stats[:reference_coverage] = @reference_coverage
46
+ @comp_stats[:collapse_factor] = @collapse_factor
47
+ @comp_stats[:n_chimeras] = @n_chimeras
48
+ @comp_stats[:p_chimeras] = @p_chimeras
61
49
  end
62
50
 
63
51
  def reciprocal_best_blast
64
- crbblast = CRB_Blast.new @assembly.file, @reference.file
65
- crbblast.run 1e-5, @threads
52
+ crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
53
+ crbblast.run(1e-5, @threads, true)
66
54
  crbblast
67
55
  end
68
56
 
69
57
  # coverage of contigs that have reciprocal hits
70
- # divided by
71
- # number of reciprocal targets
72
- def ortholog_hit_ratio crbblast
73
- return @ortholog_hit_ratio unless @ortholog_hit_ratio.nil?
58
+ # divided by number of reciprocal targets
59
+ def coverage crbblast
60
+ return @reference_coverage unless @reference_coverage.nil?
61
+ crbblast.reciprocals.each do |key, list|
62
+ list.each_with_index do |hit, i|
63
+ unless @reference.assembly.key? hit.target
64
+ raise "#{hit.target} not in reference"
65
+ end
66
+ @reference[hit.target].hits << hit
74
67
 
75
- targets = Hash.new
76
- crbblast.reciprocals.each_pair do |key, list|
77
- list.each do |hit|
78
- targets[hit.target] ||= [] # if key doesn't exist add it with a []
79
- targets[hit.target] << hit
68
+ unless @assembly.assembly.key? hit.query
69
+ raise "#{hit.query} not in assembly"
70
+ end
71
+ contig = @assembly[hit.query]
72
+ contig.has_crb = true
73
+ # how much of the reference is covered by this single contig
74
+ contig.reference_coverage = hit.alnlen / hit.tlen
75
+ contig.hits << hit
80
76
  end
81
77
  end
82
- @n_refs_with_recip = targets.size
83
- total_coverage=0
84
- total_length=0
85
- targets.each_pair do |key, list|
78
+ total_coverage = 0
79
+ total_length = 0
80
+ cov = [0.25, 0.5, 0.75, 0.85, 0.95]
81
+ @reference.each_value do |ref_contig|
82
+ key = ref_contig.name
83
+ list = ref_contig.hits
84
+ total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
85
+
86
+ next if list.empty? # ah this is what was breaking everything
86
87
  blocks = []
87
88
  target_length = 0
88
89
  list.each do |hit|
89
90
  target_length = hit.tlen
90
91
  if crbblast.target_is_prot
91
92
  target_length *= 3
92
- start, stop = [hit.tstart*3, hit.tend*3].minmax
93
+ start, stop = [hit.tstart, hit.tend].minmax
94
+ start = start*3-2
95
+ stop = stop*3
93
96
  else
94
97
  start, stop = [hit.tstart, hit.tend].minmax
95
98
  end
@@ -112,7 +115,8 @@ module Transrate
112
115
  block[0] = start
113
116
  block[1] = stop
114
117
  found=true
115
- # elsif o == 4 # full overlap
118
+ elsif o == 4 # full overlap
119
+ found=true
116
120
  # nothing
117
121
  # elsif o == 5 || o == 6 # no overlap
118
122
 
@@ -157,28 +161,53 @@ module Transrate
157
161
  end # each_with_index b
158
162
  end # each_with_index a
159
163
  # sum blocks to find total coverage
160
- length_of_coverage=0
161
- blocks.each do |block|
162
- if block[0] and block[1]
163
- if block[0]>=0 and block[1]>=0
164
- length_of_coverage += block[1] - block[0] + 1
165
- end
166
- else
167
- puts "error: key = #{key}, #{blocks}"
168
- end
169
- end
170
- cov = [0.25, 0.5, 0.75, 0.85, 0.95]
164
+ length_of_coverage = calculate_coverage blocks
171
165
  @cov ||= [0, 0, 0, 0, 0]
172
- p = length_of_coverage / target_length.to_f
166
+ if target_length > 0
167
+ # puts "#{length_of_coverage} / #{target_length.to_f}"
168
+ ref_p = length_of_coverage / target_length.to_f
169
+ else
170
+ ref_p = 0
171
+ end
172
+ ref_contig.reference_coverage = ref_p
173
+
173
174
  cov.each_with_index do |c, i|
174
- if p >= c
175
+ if ref_p >= c
175
176
  @cov[i] +=1
176
177
  end
177
178
  end
179
+
178
180
  total_coverage += length_of_coverage
179
- total_length += target_length
180
181
  end
181
- return ortholog_hit_ratio = total_coverage / total_length.to_f
182
+ cov.each_with_index do |p, i|
183
+ @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
184
+ @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
185
+ @cov[i]/@reference.size.to_f
186
+ end
187
+ total_coverage / total_length.to_f
188
+ end
189
+
190
+ # Calculate the total coverage from a set of coverage blocks
191
+ def calculate_coverage blocks
192
+ coverage = 0
193
+ blocks.each do |block|
194
+ if block[0] and block[1]
195
+ if block[0]>=0 and block[1]>=0
196
+ coverage += block[1] - block[0] + 1
197
+ end
198
+ else
199
+ puts "error: key = #{key}, #{blocks}"
200
+ end
201
+ end
202
+ coverage
203
+ end
204
+
205
+ # Count reference proteins with at least one reciprocal hit
206
+ def count_ref_crbbs
207
+ @n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
208
+ name, contig = entry
209
+ sum + (contig.hits.length > 0 ? 1 : 0)
210
+ end
182
211
  end
183
212
 
184
213
  def chimeras crbblast
@@ -210,6 +239,10 @@ module Transrate
210
239
  end
211
240
  if p/list.size.to_f >= 0.5
212
241
  @n_chimeras += 1
242
+ unless @assembly.assembly.key? key
243
+ puts "key not in assembly: #{key}"
244
+ end
245
+ @assembly[key].is_chimera = true
213
246
  end
214
247
  end
215
248
  @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
@@ -267,20 +300,17 @@ module Transrate
267
300
  end
268
301
  end
269
302
 
270
- def collapse_factor hits=nil
303
+ # Count unique reference proteins per contig
304
+ def collapse_factor reciprocals
271
305
  return @collapse_factor unless @collapse_factor.nil?
272
- targets = {}
273
- hits.each_pair do |query, list|
274
- list.each do |hit|
275
- target = hit.target
276
- unless targets.has_key? target
277
- targets[target] = Set.new
278
- end
279
- targets[target] << query
280
- end
306
+ cf_sum = 0
307
+ reciprocals.each do |query, hits|
308
+ uniq_hits = Set.new hits.map{ |h| h.target }
309
+ cf = uniq_hits.length
310
+ @assembly[query].collapse_factor = cf
311
+ cf_sum += cf
281
312
  end
282
- sum = targets.values.reduce(0.0){ |summer, val| summer += val.size }
283
- sum / targets.size
313
+ cf_sum / reciprocals.size
284
314
  end
285
315
 
286
316
  end # ComparativeMetrics
@@ -1,5 +1,4 @@
1
1
  require 'forwardable'
2
- require 'inline'
3
2
 
4
3
  module Transrate
5
4
 
@@ -9,51 +8,105 @@ module Transrate
9
8
  include Enumerable
10
9
  extend Forwardable
11
10
  def_delegators :@seq, :size, :length
12
- attr_accessor :seq, :name, :coverage
11
+ attr_accessor :seq, :name
12
+ # read-based metrics
13
+ attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
14
+ # reference-based metrics
15
+ attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
16
+ attr_accessor :hits
13
17
 
14
18
  def initialize(seq, name: nil)
19
+ seq.seq.gsub!("\0", "") # there is probably a better fix than this
15
20
  @seq = seq
21
+ @seq.data = nil # no need to store raw fasta string
16
22
  @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
23
+ @hits = []
24
+ @reference_coverage = 0
25
+ @collapse_factor = 0
26
+ @is_chimera = false
27
+ @has_crb = false
28
+ @in_bridges = 0
29
+ @mean_coverage = 0
17
30
  end
18
31
 
19
32
  def each &block
20
33
  @seq.seq.each_char &block
21
34
  end
22
35
 
36
+ # Get the basic sequence metrics for this contig
37
+ def basic_metrics
38
+ basic = {
39
+ :length => length,
40
+ :prop_gc => prop_gc,
41
+ :gc_skew => gc_skew,
42
+ :at_skew => at_skew,
43
+ :cpg_count => cpg_count,
44
+ :cpg_ratio => cpg_ratio,
45
+ :orf_length => orf_length,
46
+ :linguistic_complexity_6 => linguistic_complexity(6)
47
+ }
48
+ end
49
+
50
+ def read_metrics
51
+ read = @coverage ? {
52
+ :uncovered_bases => uncovered_bases,
53
+ :mean_coverage => mean_coverage,
54
+ :in_bridges => in_bridges
55
+ } : {
56
+ :uncovered_bases => "NA",
57
+ :mean_coverage => "NA",
58
+ :in_bridges => in_bridges
59
+ }
60
+ end
61
+
62
+ def comparative_metrics
63
+ reference = @has_crb ? {
64
+ :has_crb => has_crb,
65
+ :collapse_factor => collapse_factor,
66
+ :reference_coverage => reference_coverage,
67
+ :is_chimera => is_chimera,
68
+ :hits => hits.map{ |h| h.target }.join(";")
69
+ } : {
70
+ :has_crb => false,
71
+ :collapse_factor => "NA",
72
+ :reference_coverage => "NA",
73
+ :is_chimera => "NA",
74
+ :hits => "NA"
75
+ }
76
+ end
77
+
23
78
  # Base composition of the contig
79
+ #
80
+ # If called and the instance variable @base_composition is nil
81
+ # then call the C method to count the bases and dibases in the sequence
82
+ # then get the info out of the C array and store it in the hash
83
+ # then if it is called again just return the hash as before
24
84
  def base_composition
25
85
  if @base_composition
26
86
  return @base_composition
27
87
  end
28
- base_comp = {
29
- :a => 0,
30
- :t => 0,
31
- :c => 0,
32
- :g => 0,
33
- :n => 0
34
- }
35
- dibase_comp = {
36
- :cg => 0
37
- }
38
- last_base = nil
39
- @seq.seq.each_char do |base|
40
- # single bases
41
- key = base.downcase.to_sym
42
- base_comp[key] += 1
43
- if last_base
44
- # pairs of bases
45
- dikey = "#{last_base}#{base}".downcase.to_sym
46
- if dibase_comp[dikey]
47
- dibase_comp[dikey] += 1
48
- else
49
- dibase_comp[dikey] = 1
50
- end
88
+ # else run the C method
89
+ composition(@seq.seq)
90
+ alphabet = ['a', 'c', 'g', 't', 'n']
91
+ @base_composition = {}
92
+ @dibase_composition={}
93
+ bases = []
94
+ dibases = []
95
+ alphabet.each do |c|
96
+ bases << "#{c}".to_sym
97
+ end
98
+ alphabet.each do |c|
99
+ alphabet.each do |d|
100
+ dibases << "#{c}#{d}".to_sym
51
101
  end
52
- last_base = base
53
102
  end
54
- @base_composition = base_comp
55
- @dibase_composition = dibase_comp
56
- return base_comp
103
+ bases.each_with_index do |a,i|
104
+ @base_composition[a] = base_count(i)
105
+ end
106
+ dibases.each_with_index do |a,i|
107
+ @dibase_composition[a] = dibase_count(i)
108
+ end
109
+ return @base_composition
57
110
  end
58
111
 
59
112
  # Dibase composition of the contig
@@ -124,89 +177,37 @@ module Transrate
124
177
 
125
178
  # GC skew
126
179
  def gc_skew
127
- prop_gc / (prop_a + prop_t + prop_gc)
180
+ (bases_g - bases_c) / (bases_g + bases_c).to_f
128
181
  end
129
182
 
130
183
  # AT skew
131
184
  def at_skew
132
- prop_a + prop_t / (prop_a + prop_t + prop_gc)
185
+ (bases_a - bases_t) / (bases_a + bases_t).to_f
133
186
  end
134
187
 
135
188
  # CpG count
136
189
  def cpg_count
137
- dibase_composition[:cg]
190
+ dibase_composition[:cg] + dibase_composition[:gc]
138
191
  end
139
192
 
140
- # CpG (C-phosphate-G) ratio
193
+ # observed-to-expected CpG (C-phosphate-G) ratio
141
194
  def cpg_ratio
142
- dibase_composition[:cg] / (prop_c * prop_g)
195
+ r = dibase_composition[:cg] + dibase_composition[:gc]
196
+ r /= (bases_c * bases_g).to_f
197
+ r *= (length - bases_n)
198
+ return r
143
199
  end
144
200
 
145
201
  # Find the longest orf in the contig
146
202
  def orf_length
147
- longest = longest_orf @seq.seq
148
- return longest
149
- end
150
-
151
- # Inlined C longest-ORF function
152
- inline do |builder|
153
- builder.c <<SRC
154
- static
155
- void
156
- longest_orf(VALUE _s) {
157
- int i,sl,longest=0;
158
- int len[6];
159
- char * c_str;
160
-
161
- sl = RSTRING_LEN(_s);
162
- c_str = StringValueCStr(_s);
163
- for (i=0;i<6;i++) {
164
- len[i]=0;
165
- }
166
- for (i=0;i<sl-2;i++) {
167
- if (c_str[i]=='T' &&
168
- ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
169
- (c_str[i+1]=='A' && c_str[i+2]=='A') ||
170
- (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
171
- if (len[i%3] > longest) {
172
- longest = len[i%3];
173
- }
174
- len[i%3]=0;
175
- } else {
176
- len[i%3]++;
177
- }
178
- if (c_str[i+2]=='A' &&
179
- ((c_str[i]=='C' && c_str[i+1]=='T') ||
180
- (c_str[i]=='T' && c_str[i+1]=='T') ||
181
- (c_str[i]=='T' && c_str[i+1]=='C'))) {
182
- if (len[3+i%3] > longest) {
183
- longest = len[3+i%3];
184
- }
185
- len[3+i%3]=0;
186
- } else {
187
- len[3+i%3]++;
188
- }
189
- }
190
- if (len[i%3] > longest) {
191
- longest = len[i%3];
192
- }
193
- if (len[3+i%3] > longest) {
194
- longest = len[3+i%3];
195
- }
196
- return INT2NUM(longest);
197
- }
198
- SRC
203
+ return @orf_length if @orf_length
204
+ @orf_length = longest_orf(@seq.seq) # call to C
205
+ return @orf_length
199
206
  end
200
207
 
201
208
  def linguistic_complexity k
202
- d = 4 ** k
203
- set = Set.new
204
- (0..@seq.length-k).each do |i|
205
- set << @seq.seq.slice(i,k).upcase # slice(start, length)
206
- end # count how many kmers in seq
207
- set.size / d.to_f
209
+ return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
208
210
  end
209
-
210
211
  end
211
212
 
212
213
  end