transrate 0.1.0 → 0.2.0

Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.gitignore +16 -1
  3. data/.travis.yml +8 -0
  4. data/README.md +45 -43
  5. data/Rakefile +36 -0
  6. data/bin/transrate +98 -50
  7. data/deps/deps.yaml +55 -0
  8. data/lib/transrate.rb +19 -4
  9. data/lib/transrate/assembly.rb +93 -182
  10. data/lib/transrate/bowtie2.rb +37 -13
  11. data/lib/transrate/cmd.rb +19 -0
  12. data/lib/transrate/comparative_metrics.rb +239 -19
  13. data/lib/transrate/contig.rb +212 -0
  14. data/lib/transrate/contig_metrics.rb +76 -0
  15. data/lib/transrate/read_metrics.rb +83 -41
  16. data/lib/transrate/samtools.rb +73 -0
  17. data/lib/transrate/transrater.rb +31 -11
  18. data/lib/transrate/version.rb +1 -1
  19. data/test/data/150uncovered.l.fq +892 -0
  20. data/test/data/150uncovered.r.fq +892 -0
  21. data/test/data/Os.protein.2.fa +95 -0
  22. data/test/data/Os.protein.fa +199 -0
  23. data/test/data/assembly.2.fa +26 -0
  24. data/test/{assembly.fasta → data/assembly.fasta} +0 -0
  25. data/test/data/bridging_reads.l.fastq +20 -0
  26. data/test/data/bridging_reads.r.fastq +20 -0
  27. data/test/data/sorghum_transcript.fa +4 -0
  28. data/test/data/tiny.sam +4 -0
  29. data/test/helper.rb +33 -2
  30. data/test/test_bowtie.rb +54 -0
  31. data/test/test_cmd.rb +15 -0
  32. data/test/test_comp_metrics.rb +177 -0
  33. data/test/test_contig.rb +61 -0
  34. data/test/test_contig_metrics.rb +50 -0
  35. data/test/test_inline.rb +10 -9
  36. data/test/test_read_metrics.rb +68 -0
  37. data/test/test_samtools.rb +22 -0
  38. data/test/test_transrate.rb +40 -0
  39. data/test/test_transrater.rb +68 -0
  40. data/transrate.gemspec +16 -10
  41. metadata +232 -57
  42. data/lib/transrate/express.rb +0 -37
  43. data/lib/transrate/log.rb +0 -16
  44. data/lib/transrate/rb_hit.rb +0 -33
  45. data/lib/transrate/reciprocal_annotation.rb +0 -105
  46. data/lib/transrate/usearch.rb +0 -66
  47. data/test/test_test.rb +0 -41
data/deps/deps.yaml ADDED
@@ -0,0 +1,55 @@
+ blastplus:
+   binaries:
+     - makeblastdb
+     - blastn
+     - tblastn
+     - blastp
+     - blastx
+     - tblastx
+     - makembindex
+     - psiblast
+     - rpsblast
+     - blastdbcmd
+     - segmasker
+     - dustmasker
+     - blast_formatter
+     - windowmasker
+     - blastdb_aliastool
+     - deltablast
+     - rpstblastn
+     - blastdbcheck
+   version:
+     number: '2.2.29'
+     command: 'blastx -version'
+   url:
+     64bit:
+       macosx: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-universal-macosx.tar.gz
+       linux: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-x64-linux.tar.gz
+ bowtie2:
+   binaries:
+     - bowtie2
+     - bowtie2-align-l
+     - bowtie2-align-s
+     - bowtie2-build
+     - bowtie2-build-l
+     - bowtie2-build-s
+     - bowtie2-inspect
+     - bowtie2-inspect-l
+     - bowtie2-inspect-s
+   version:
+     number: '2.2.3'
+     command: 'bowtie2 --version'
+   url:
+     64bit:
+       linux: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-linux-x86_64.zip
+       macosx: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-macos-x86_64.zip
+ express:
+   binaries:
+     - express
+   version:
+     number: '1.5.1'
+     command: 'express --version'
+   url:
+     64bit:
+       linux: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-linux_x86_64.tgz
+       macosx: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-macosx_x86_64.tgz
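The new deps.yaml describes each external tool transrate shells out to: the binaries it expects on the PATH, an expected version together with the command that reports it, and 64-bit download URLs per platform. A minimal sketch of how a file with this layout could be checked, assuming only Ruby's standard YAML library; the check_deps helper and the use of `which` are illustrative, not transrate's own dependency handling:

require 'yaml'

# Illustrative helper (not part of transrate): verify that every binary
# declared in a deps.yaml entry is on the PATH and reports the expected version.
def check_deps path
  YAML.load_file(path).each do |name, spec|
    missing = spec['binaries'].reject do |bin|
      system("which #{bin} > /dev/null 2>&1")
    end
    puts "#{name}: missing #{missing.join(', ')}" unless missing.empty?
    reported = `#{spec['version']['command']} 2>&1`
    unless reported.include? spec['version']['number']
      puts "#{name}: expected version #{spec['version']['number']}"
    end
  end
end

check_deps 'data/deps/deps.yaml'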
data/lib/transrate.rb CHANGED
@@ -1,18 +1,33 @@
+ # before the
+ require 'rbconfig'
+ require 'yell'
+ RbConfig::CONFIG['CFLAGS'] = ''
+
  require 'transrate/transrater'
  require 'transrate/version'
+ require 'transrate/contig'
  require 'transrate/assembly'
  require 'transrate/bowtie2'
  require 'transrate/read_metrics'
- require 'transrate/usearch'
- require 'transrate/rb_hit'
- require 'transrate/reciprocal_annotation'
  require 'transrate/comparative_metrics'
+ require 'transrate/contig_metrics'
  require 'transrate/metric'
  require 'transrate/dimension_reduce'
- require 'transrate/express'
+ require 'transrate/samtools'
+ require 'transrate/cmd'

  # Transrate is a comprehensive transcriptome assembly
  # quality assessment tool.
  module Transrate

+   # Create the universal logger and include it in Object
+   # making the logger object available everywhere
+   Yell.new(:format => "[%5L]: %m") do |l|
+     l.level = :info
+     l.name = Object
+     l.adapter STDOUT, level: [:debug, :info, :warn]
+     l.adapter STDERR, level: [:error, :fatal]
+   end
+   Object.send :include, Yell::Loggable
+
  end # Transrate
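Because the Yell logger above is registered with Object as its name and Yell::Loggable is mixed into Object, every object in a transrate process can call logger directly, with debug/info/warn routed to STDOUT and error/fatal to STDERR. A small standalone sketch of the pattern; the Aligner class is a made-up example, not transrate code:

require 'yell'

# Same setup as lib/transrate.rb: one process-wide logger named Object
Yell.new(:format => "[%5L]: %m") do |l|
  l.level = :info
  l.name = Object
  l.adapter STDOUT, level: [:debug, :info, :warn]
  l.adapter STDERR, level: [:error, :fatal]
end
Object.send :include, Yell::Loggable

# Hypothetical class: any object now responds to #logger
class Aligner
  def run
    logger.info 'starting alignment'   # written to STDOUT
    logger.error 'no reads provided'   # written to STDERR
  end
end

Aligner.new.run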
data/lib/transrate/assembly.rb CHANGED
@@ -1,8 +1,6 @@
  require 'bio'
- require 'bettersam'
  require 'csv'
  require 'forwardable'
- require 'inline'

  module Transrate

@@ -30,36 +28,28 @@ module Transrate
  extend Forwardable
  def_delegators :@assembly, :each, :<<, :size, :length

- attr_accessor :ublast_db
- attr_accessor :orfs_ublast_db
+ attr_accessor :file
  attr_reader :assembly
  attr_reader :has_run
- attr_writer :n_bases
- attr_accessor :file
+ attr_accessor :n_bases
  attr_reader :n50
+ attr_accessor :contig_metrics

  # Create a new Assembly.
  #
  # @param file [String] path to the assembly FASTA file
  def initialize file
-   @file = file
+   @file = File.expand_path file
+   unless File.exist? @file
+     raise IOError.new "Assembly file doesn't exist: #{@file}"
+   end
    @assembly = []
    @n_bases = 0
    Bio::FastaFormat.open(file).each do |entry|
      @n_bases += entry.length
-     @assembly << entry
+     @assembly << Contig.new(entry)
    end
- end
-
- # Return basic statistics about the assembly in
- # the specified FASTA file
- #
- # @param file [String] path to assebmly FASTA file
- #
- # @return [Hash] basic statistics about the assembly
- def self.stats_from_fasta file
-   a = Assembly.new file
-   a.basic_stats
+   @contig_metrics = ContigMetrics.new self
  end

  # Generate and store the basic statistics for this assembly
@@ -74,6 +64,7 @@ module Transrate
      singleton_class.class_eval { attr_accessor attr_ivar }
      self.instance_variable_set(ivar, value)
    end
+   @contig_metrics.run
    @has_run = true
  end

@@ -85,60 +76,14 @@ module Transrate
  # @param threads [Integer] number of threads to use
  #
  # @return [Hash] basic statistics about the assembly
- def basic_stats threads=8
-
-   # disable threading basic stats for now
-   threads = 1
-
-   # create a work queue to process contigs in parallel
-   queue = Queue.new
-
-   # split the contigs into equal sized bins, one bin per thread
-   binsize = (@assembly.size / threads.to_f).ceil
-   @assembly.each_slice(binsize) do |bin|
-     queue << bin
-   end
-
-   # a classic threadpool - an Array of threads that allows
-   # us to assign work to each thread and then aggregate their
-   # results when they are all finished
-   threadpool = []
-
-   # assign one bin of contigs to each thread from the queue.
-   # each thread will process its bin of contigs and then wait
-   # for the others to finish.
-   semaphore = Mutex.new
-   stats = []
-
-   threads.times do
-     threadpool << Thread.new do |thread|
-       # keep looping until we run out of bins
-       until queue.empty?
-
-         # use non-blocking pop, so an exception is raised
-         # when the queue runs dry
-         bin = queue.pop(true) rescue nil
-         if bin
-           # calculate basic stats for the bin, storing them
-           # in the current thread so they can be collected
-           # in the main thread.
-           bin_stats = basic_bin_stats bin
-           semaphore.synchronize { stats << bin_stats }
-         end
-       end
-     end
-   end
-
-   # collect the stats calculated in each thread and join
-   # the threads to terminate them
-   threadpool.each(&:join)
-
-   # merge the collected stats and return then
-   merge_basic_stats stats
-
+ def basic_stats threads=1
+   return @basic_stats if @basic_stats
+   bin = @assembly.dup
+   @basic_stats = basic_bin_stats bin
+   @basic_stats
  end # basic_stats

-
+
  # Calculate basic statistics in an single thread for a bin
  # of contigs.
  #
@@ -156,13 +101,13 @@ module Transrate
  #
  # @param [Array] bin An array of Bio::Sequence objects
  # representing contigs in the assembly
-
- def basic_bin_stats bin

+ def basic_bin_stats bin
+
    # cumulative length is a float so we can divide it
    # accurately later to get the mean length
    cumulative_length = 0.0
-
+
    # we'll calculate Nx for x in [10, 30, 50, 70, 90]
    # to do this we create a stack of the x values and
    # pop the first one to set the first cutoff. when
@@ -174,23 +119,30 @@ module Transrate
    x2 = x.clone
    cutoff = x2.pop / 100.0
    res = []
-   n1k = 0
-   n10k = 0
-   orf_length_sum = 0
-
+   n_under_200, n_over_1k, n_over_10k, n_with_orf, orf_length_sum = 0,0,0,0,0
    # sort the contigs in ascending length order
    # and iterate over them
-   bin.sort_by! { |c| c.seq.size }
+   bin.sort_by! { |c| c.seq.length }
    bin.each do |contig|
-
+
      # increment our long contig counters if this
      # contig is above the thresholds
-     n1k += 1 if contig.length > 1_000
-     n10k += 1 if contig.length > 10_000
+     if contig.length < 200
+       # ignore contigs less than 200 bases,
+       # but record how many there are
+       n_under_200 += 1
+       next
+     end
+     n_over_1k += 1 if contig.length > 1_000
+     n_over_10k += 1 if contig.length > 10_000

      # add the length of the longest orf to the
      # running total
-     orf_length_sum += orf_length(contig.seq)
+     orf_length = contig.orf_length
+     orf_length_sum += orf_length
+     # only consider orfs that are realistic length
+     # (here we set minimum amino acid length as 50)
+     n_with_orf += 1 if orf_length > 149

      # increment the cumulative length and check whether the Nx
      # cutoff has been reached. if it has, store the Nx value and
@@ -199,125 +151,84 @@ module Transrate
      if cumulative_length >= @n_bases * cutoff
        res << contig.length
        if x2.empty?
-         cutoff=1
+         cutoff = 1
        else
          cutoff = x2.pop / 100.0
-       end
+       end
      end

    end

+   # if there aren't enough sequences we might have no value for some
+   # of the Nx. Fill the empty ones in with the longest contig length.
+   while res.length < x.length do
+     res << bin.last.length
+   end
+
    # calculate and return the statistics as a hash
    mean = cumulative_length / @assembly.size
-   ns = Hash[x.map { |n| "N#{n}" }.zip(res)]
+   ns = Hash[x.map { |n| "n#{n}" }.zip(res)]
    {
      'n_seqs' => bin.size,
      'smallest' => bin.first.length,
      'largest' => bin.last.length,
      'n_bases' => n_bases,
      'mean_len' => mean,
-     'n_1k' => n1k,
-     'n_10k' => n10k,
-     'orf_percent' => 300 * orf_length_sum / (@assembly.size * mean)
+     'n_under_200' => n_under_200,
+     'n_over_1k' => n_over_1k,
+     'n_over_10k' => n_over_10k,
+     'n_with_orf' => n_with_orf,
+     'mean_orf_percent' => 300 * orf_length_sum / (@assembly.size * mean)
    }.merge ns

  end # basic_bin_stats

- def merge_basic_stats stats
-   # convert the array of hashes into a hash of arrays
-   collect = Hash.new{|h,k| h[k]=[]}
-   stats.each_with_object(collect) do |collect, result|
-     collect.each{ |k, v| result[k] << v }
-   end
-   merged = {}
-   collect.each_pair do |stat, values|
-     if stat == 'orf_percent' || /N[0-9]{2}/ =~ stat
-       # store the mean
-       merged[stat] = values.inject(:+) / values.size
-     elsif stat == 'smallest'
-       merged[stat] = values.min
-     elsif stat == 'largest'
-       merged[stat] = values.max
-     else
-       # store the sum
-       merged[stat] = values.inject(:+)
+ # Calls *block* with two arguments, the contig and an array
+ # of integer per-base coverage counts.
+ #
+ # @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
+ # @param block [Block] the block to call
+ def each_with_coverage(bam, &block)
+   logger.debug 'enumerating assembly with coverage'
+   # generate coverage with samtools
+   covfile = Samtools.coverage bam
+   # get an assembly enumerator
+   assembly_enum = @assembly.to_enum
+   contig = assembly_enum.next
+   # precreate an array of the correct size to contain
+   # coverage. this is necessary because samtools mpileup
+   # doesn't print a result line for bases with 0 coverage
+   contig.coverage = Array.new(contig.length, 0)
+   # the columns we need
+   name_i, pos_i, cov_i = 0, 1, 3
+   # parse the coverage file
+   File.open(covfile).each_line do |line|
+     cols = line.chomp.split("\t")
+     unless (cols && cols.length > 4)
+       # last line
+       break
      end
+     # extract the columns
+     name, pos, cov = cols[name_i], cols[pos_i].to_i, cols[cov_i].to_i
+     unless contig.name == name
+       while contig.name != name
+         begin
+           block.call(contig, contig.coverage)
+           contig = assembly_enum.next
+           contig.coverage = Array.new(contig.length, 0)
+         rescue StopIteration => stop_error
+           logger.error 'reached the end of assembly enumerator while ' +
+             'there were contigs left in the coverage results'
+           logger.error "final assembly contig: #{@assembly.last.name}"
+           logger.error "coverage contig: #{name}"
+           raise stop_error
+         end
+       end
+     end
+     contig.coverage[pos - 1] = cov
    end
-
-   merged
-
- end # merge_basic_stats
-
- inline do |builder|
-
-   builder.c <<SRC
-     static
-     void
-     longest_orf(VALUE _s) {
-       int i,sl,longest=0;
-       int len[6];
-       char * c_str;
-
-       sl = RSTRING_LEN(_s);
-       c_str = StringValueCStr(_s);
-       for (i=0;i<6;i++) {
-         len[i]=0;
-       }
-       for (i=0;i<sl-2;i++) {
-         if (c_str[i]=='T' &&
-             ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
-              (c_str[i+1]=='A' && c_str[i+2]=='A') ||
-              (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
-           if (len[i%3] > longest) {
-             longest = len[i%3];
-           }
-           len[i%3]=0;
-         } else {
-           len[i%3]++;
-         }
-         if (c_str[i+2]=='A' &&
-             ((c_str[i]=='C' && c_str[i+1]=='T') ||
-              (c_str[i]=='T' && c_str[i+1]=='T') ||
-              (c_str[i]=='T' && c_str[i+1]=='C'))) {
-           if (len[3+i%3] > longest) {
-             longest = len[3+i%3];
-           }
-           len[3+i%3]=0;
-         } else {
-           len[3+i%3]++;
-         }
-       }
-       if (len[i%3] > longest) {
-         longest = len[i%3];
-       }
-       if (len[3+i%3] > longest) {
-         longest = len[3+i%3];
-       }
-       return INT2NUM(longest);
-     }
- SRC
- end
-
- # finds longest orf in a sequence
- def orf_length sequence
-   longest = longest_orf(sequence)
-   return longest
- end
-
- # return the number of bases in the assembly, calculating
- # from the assembly if it hasn't already been done.
- def n_bases
-   unless @n_bases
-     @n_bases = 0
-     @assembly.each { |s| @n_bases += s.length }
-   end
-   @n_bases
- end
-
- def print_stats
-   self.basic_stats.map do |k, v|
-     "#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
-   end.join('\n')
+   # yield the final contig
+   block.call(contig, contig.coverage)
  end

  end # Assembly
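The rewritten basic_bin_stats drops the thread pool and the inline C ORF scanner, counts contigs under 200 bases, over 1k, over 10k, and with a plausible ORF, and computes the Nx values by walking contigs in ascending length order against cumulative-base cutoffs, back-filling any missing Nx with the longest contig. A standalone sketch of that Nx walk, with a hypothetical nx_values helper and made-up example lengths; illustrative only, not transrate code:

# Sketch of the Nx walk: with contigs sorted by ascending length, the contig
# at which the cumulative base count passes (100 - x)% of all bases is the
# Nx value (so N90 uses the 10% cutoff when walking from the shortest contig).
def nx_values lengths, xs = [90, 70, 50, 30, 10]
  sorted = lengths.sort
  total = sorted.reduce(:+).to_f
  cutoffs = xs.map { |x| (100 - x) / 100.0 }
  cumulative = 0.0
  res = []
  sorted.each do |len|
    cumulative += len
    # a single contig can satisfy several cutoffs at once
    while res.length < cutoffs.length && cumulative >= total * cutoffs[res.length]
      res << len
    end
  end
  Hash[xs.map { |x| "n#{x}" }.zip(res)]
end

p nx_values([100, 200, 300, 400, 500, 1000, 2000])
# => {"n90"=>300, "n70"=>500, "n50"=>1000, "n30"=>2000, "n10"=>2000}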