transrate 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.gitignore +16 -1
  3. data/.travis.yml +8 -0
  4. data/README.md +45 -43
  5. data/Rakefile +36 -0
  6. data/bin/transrate +98 -50
  7. data/deps/deps.yaml +55 -0
  8. data/lib/transrate.rb +19 -4
  9. data/lib/transrate/assembly.rb +93 -182
  10. data/lib/transrate/bowtie2.rb +37 -13
  11. data/lib/transrate/cmd.rb +19 -0
  12. data/lib/transrate/comparative_metrics.rb +239 -19
  13. data/lib/transrate/contig.rb +212 -0
  14. data/lib/transrate/contig_metrics.rb +76 -0
  15. data/lib/transrate/read_metrics.rb +83 -41
  16. data/lib/transrate/samtools.rb +73 -0
  17. data/lib/transrate/transrater.rb +31 -11
  18. data/lib/transrate/version.rb +1 -1
  19. data/test/data/150uncovered.l.fq +892 -0
  20. data/test/data/150uncovered.r.fq +892 -0
  21. data/test/data/Os.protein.2.fa +95 -0
  22. data/test/data/Os.protein.fa +199 -0
  23. data/test/data/assembly.2.fa +26 -0
  24. data/test/{assembly.fasta → data/assembly.fasta} +0 -0
  25. data/test/data/bridging_reads.l.fastq +20 -0
  26. data/test/data/bridging_reads.r.fastq +20 -0
  27. data/test/data/sorghum_transcript.fa +4 -0
  28. data/test/data/tiny.sam +4 -0
  29. data/test/helper.rb +33 -2
  30. data/test/test_bowtie.rb +54 -0
  31. data/test/test_cmd.rb +15 -0
  32. data/test/test_comp_metrics.rb +177 -0
  33. data/test/test_contig.rb +61 -0
  34. data/test/test_contig_metrics.rb +50 -0
  35. data/test/test_inline.rb +10 -9
  36. data/test/test_read_metrics.rb +68 -0
  37. data/test/test_samtools.rb +22 -0
  38. data/test/test_transrate.rb +40 -0
  39. data/test/test_transrater.rb +68 -0
  40. data/transrate.gemspec +16 -10
  41. metadata +232 -57
  42. data/lib/transrate/express.rb +0 -37
  43. data/lib/transrate/log.rb +0 -16
  44. data/lib/transrate/rb_hit.rb +0 -33
  45. data/lib/transrate/reciprocal_annotation.rb +0 -105
  46. data/lib/transrate/usearch.rb +0 -66
  47. data/test/test_test.rb +0 -41
data/deps/deps.yaml ADDED
@@ -0,0 +1,55 @@
1
+ blastplus:
2
+ binaries:
3
+ - makeblastdb
4
+ - blastn
5
+ - tblastn
6
+ - blastp
7
+ - blastx
8
+ - tblastx
9
+ - makembindex
10
+ - psiblast
11
+ - rpsblast
12
+ - blastdbcmd
13
+ - segmasker
14
+ - dustmasker
15
+ - blast_formatter
16
+ - windowmasker
17
+ - blastdb_aliastool
18
+ - deltablast
19
+ - rpstblastn
20
+ - blastdbcheck
21
+ version:
22
+ number: '2.2.29'
23
+ command: 'blastx -version'
24
+ url:
25
+ 64bit:
26
+ macosx: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-universal-macosx.tar.gz
27
+ linux: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-x64-linux.tar.gz
28
+ bowtie2:
29
+ binaries:
30
+ - bowtie2
31
+ - bowtie2-align-l
32
+ - bowtie2-align-s
33
+ - bowtie2-build
34
+ - bowtie2-build-l
35
+ - bowtie2-build-s
36
+ - bowtie2-inspect
37
+ - bowtie2-inspect-l
38
+ - bowtie2-inspect-s
39
+ version:
40
+ number: '2.2.3'
41
+ command: 'bowtie2 --version'
42
+ url:
43
+ 64bit:
44
+ linux: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-linux-x86_64.zip
45
+ macosx: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-macos-x86_64.zip
46
+ express:
47
+ binaries:
48
+ - express
49
+ version:
50
+ number: '1.5.1'
51
+ command: 'express --version'
52
+ url:
53
+ 64bit:
54
+ linux: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-linux_x86_64.tgz
55
+ macosx: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-macosx_x86_64.tgz
data/lib/transrate.rb CHANGED
@@ -1,18 +1,33 @@
1
+ # before the
2
+ require 'rbconfig'
3
+ require 'yell'
4
+ RbConfig::CONFIG['CFLAGS'] = ''
5
+
1
6
  require 'transrate/transrater'
2
7
  require 'transrate/version'
8
+ require 'transrate/contig'
3
9
  require 'transrate/assembly'
4
10
  require 'transrate/bowtie2'
5
11
  require 'transrate/read_metrics'
6
- require 'transrate/usearch'
7
- require 'transrate/rb_hit'
8
- require 'transrate/reciprocal_annotation'
9
12
  require 'transrate/comparative_metrics'
13
+ require 'transrate/contig_metrics'
10
14
  require 'transrate/metric'
11
15
  require 'transrate/dimension_reduce'
12
- require 'transrate/express'
16
+ require 'transrate/samtools'
17
+ require 'transrate/cmd'
13
18
 
14
19
  # Transrate is a comprehensive transcriptome assembly
15
20
  # quality assessment tool.
16
21
  module Transrate
17
22
 
23
+ # Create the universal logger and include it in Object
24
+ # making the logger object available everywhere
25
+ Yell.new(:format => "[%5L]: %m") do |l|
26
+ l.level = :info
27
+ l.name = Object
28
+ l.adapter STDOUT, level: [:debug, :info, :warn]
29
+ l.adapter STDERR, level: [:error, :fatal]
30
+ end
31
+ Object.send :include, Yell::Loggable
32
+
18
33
  end # Transrate
@@ -1,8 +1,6 @@
1
1
  require 'bio'
2
- require 'bettersam'
3
2
  require 'csv'
4
3
  require 'forwardable'
5
- require 'inline'
6
4
 
7
5
  module Transrate
8
6
 
@@ -30,36 +28,28 @@ module Transrate
30
28
  extend Forwardable
31
29
  def_delegators :@assembly, :each, :<<, :size, :length
32
30
 
33
- attr_accessor :ublast_db
34
- attr_accessor :orfs_ublast_db
31
+ attr_accessor :file
35
32
  attr_reader :assembly
36
33
  attr_reader :has_run
37
- attr_writer :n_bases
38
- attr_accessor :file
34
+ attr_accessor :n_bases
39
35
  attr_reader :n50
36
+ attr_accessor :contig_metrics
40
37
 
41
38
  # Create a new Assembly.
42
39
  #
43
40
  # @param file [String] path to the assembly FASTA file
44
41
  def initialize file
45
- @file = file
42
+ @file = File.expand_path file
43
+ unless File.exist? @file
44
+ raise IOError.new "Assembly file doesn't exist: #{@file}"
45
+ end
46
46
  @assembly = []
47
47
  @n_bases = 0
48
48
  Bio::FastaFormat.open(file).each do |entry|
49
49
  @n_bases += entry.length
50
- @assembly << entry
50
+ @assembly << Contig.new(entry)
51
51
  end
52
- end
53
-
54
- # Return basic statistics about the assembly in
55
- # the specified FASTA file
56
- #
57
- # @param file [String] path to assebmly FASTA file
58
- #
59
- # @return [Hash] basic statistics about the assembly
60
- def self.stats_from_fasta file
61
- a = Assembly.new file
62
- a.basic_stats
52
+ @contig_metrics = ContigMetrics.new self
63
53
  end
64
54
 
65
55
  # Generate and store the basic statistics for this assembly
@@ -74,6 +64,7 @@ module Transrate
74
64
  singleton_class.class_eval { attr_accessor attr_ivar }
75
65
  self.instance_variable_set(ivar, value)
76
66
  end
67
+ @contig_metrics.run
77
68
  @has_run = true
78
69
  end
79
70
 
@@ -85,60 +76,14 @@ module Transrate
85
76
  # @param threads [Integer] number of threads to use
86
77
  #
87
78
  # @return [Hash] basic statistics about the assembly
88
- def basic_stats threads=8
89
-
90
- # disable threading basic stats for now
91
- threads = 1
92
-
93
- # create a work queue to process contigs in parallel
94
- queue = Queue.new
95
-
96
- # split the contigs into equal sized bins, one bin per thread
97
- binsize = (@assembly.size / threads.to_f).ceil
98
- @assembly.each_slice(binsize) do |bin|
99
- queue << bin
100
- end
101
-
102
- # a classic threadpool - an Array of threads that allows
103
- # us to assign work to each thread and then aggregate their
104
- # results when they are all finished
105
- threadpool = []
106
-
107
- # assign one bin of contigs to each thread from the queue.
108
- # each thread will process its bin of contigs and then wait
109
- # for the others to finish.
110
- semaphore = Mutex.new
111
- stats = []
112
-
113
- threads.times do
114
- threadpool << Thread.new do |thread|
115
- # keep looping until we run out of bins
116
- until queue.empty?
117
-
118
- # use non-blocking pop, so an exception is raised
119
- # when the queue runs dry
120
- bin = queue.pop(true) rescue nil
121
- if bin
122
- # calculate basic stats for the bin, storing them
123
- # in the current thread so they can be collected
124
- # in the main thread.
125
- bin_stats = basic_bin_stats bin
126
- semaphore.synchronize { stats << bin_stats }
127
- end
128
- end
129
- end
130
- end
131
-
132
- # collect the stats calculated in each thread and join
133
- # the threads to terminate them
134
- threadpool.each(&:join)
135
-
136
- # merge the collected stats and return then
137
- merge_basic_stats stats
138
-
79
+ def basic_stats threads=1
80
+ return @basic_stats if @basic_stats
81
+ bin = @assembly.dup
82
+ @basic_stats = basic_bin_stats bin
83
+ @basic_stats
139
84
  end # basic_stats
140
85
 
141
-
86
+
142
87
  # Calculate basic statistics in a single thread for a bin
143
88
  # of contigs.
144
89
  #
@@ -156,13 +101,13 @@ module Transrate
156
101
  #
157
102
  # @param [Array] bin An array of Bio::Sequence objects
158
103
  # representing contigs in the assembly
159
-
160
- def basic_bin_stats bin
161
104
 
105
+ def basic_bin_stats bin
106
+
162
107
  # cumulative length is a float so we can divide it
163
108
  # accurately later to get the mean length
164
109
  cumulative_length = 0.0
165
-
110
+
166
111
  # we'll calculate Nx for x in [10, 30, 50, 70, 90]
167
112
  # to do this we create a stack of the x values and
168
113
  # pop the first one to set the first cutoff. when
@@ -174,23 +119,30 @@ module Transrate
174
119
  x2 = x.clone
175
120
  cutoff = x2.pop / 100.0
176
121
  res = []
177
- n1k = 0
178
- n10k = 0
179
- orf_length_sum = 0
180
-
122
+ n_under_200, n_over_1k, n_over_10k, n_with_orf, orf_length_sum = 0,0,0,0,0
181
123
  # sort the contigs in ascending length order
182
124
  # and iterate over them
183
- bin.sort_by! { |c| c.seq.size }
125
+ bin.sort_by! { |c| c.seq.length }
184
126
  bin.each do |contig|
185
-
127
+
186
128
  # increment our long contig counters if this
187
129
  # contig is above the thresholds
188
- n1k += 1 if contig.length > 1_000
189
- n10k += 1 if contig.length > 10_000
130
+ if contig.length < 200
131
+ # ignore contigs less than 200 bases,
132
+ # but record how many there are
133
+ n_under_200 += 1
134
+ next
135
+ end
136
+ n_over_1k += 1 if contig.length > 1_000
137
+ n_over_10k += 1 if contig.length > 10_000
190
138
 
191
139
  # add the length of the longest orf to the
192
140
  # running total
193
- orf_length_sum += orf_length(contig.seq)
141
+ orf_length = contig.orf_length
142
+ orf_length_sum += orf_length
143
+ # only consider orfs that are realistic length
144
+ # (here we set minimum amino acid length as 50)
145
+ n_with_orf += 1 if orf_length > 149
194
146
 
195
147
  # increment the cumulative length and check whether the Nx
196
148
  # cutoff has been reached. if it has, store the Nx value and
@@ -199,125 +151,84 @@ module Transrate
199
151
  if cumulative_length >= @n_bases * cutoff
200
152
  res << contig.length
201
153
  if x2.empty?
202
- cutoff=1
154
+ cutoff = 1
203
155
  else
204
156
  cutoff = x2.pop / 100.0
205
- end
157
+ end
206
158
  end
207
159
 
208
160
  end
209
161
 
162
+ # if there aren't enough sequences we might have no value for some
163
+ # of the Nx. Fill the empty ones in with the longest contig length.
164
+ while res.length < x.length do
165
+ res << bin.last.length
166
+ end
167
+
210
168
  # calculate and return the statistics as a hash
211
169
  mean = cumulative_length / @assembly.size
212
- ns = Hash[x.map { |n| "N#{n}" }.zip(res)]
170
+ ns = Hash[x.map { |n| "n#{n}" }.zip(res)]
213
171
  {
214
172
  'n_seqs' => bin.size,
215
173
  'smallest' => bin.first.length,
216
174
  'largest' => bin.last.length,
217
175
  'n_bases' => n_bases,
218
176
  'mean_len' => mean,
219
- 'n_1k' => n1k,
220
- 'n_10k' => n10k,
221
- 'orf_percent' => 300 * orf_length_sum / (@assembly.size * mean)
177
+ 'n_under_200' => n_under_200,
178
+ 'n_over_1k' => n_over_1k,
179
+ 'n_over_10k' => n_over_10k,
180
+ 'n_with_orf' => n_with_orf,
181
+ 'mean_orf_percent' => 300 * orf_length_sum / (@assembly.size * mean)
222
182
  }.merge ns
223
183
 
224
184
  end # basic_bin_stats
225
185
 
226
- def merge_basic_stats stats
227
- # convert the array of hashes into a hash of arrays
228
- collect = Hash.new{|h,k| h[k]=[]}
229
- stats.each_with_object(collect) do |collect, result|
230
- collect.each{ |k, v| result[k] << v }
231
- end
232
- merged = {}
233
- collect.each_pair do |stat, values|
234
- if stat == 'orf_percent' || /N[0-9]{2}/ =~ stat
235
- # store the mean
236
- merged[stat] = values.inject(:+) / values.size
237
- elsif stat == 'smallest'
238
- merged[stat] = values.min
239
- elsif stat == 'largest'
240
- merged[stat] = values.max
241
- else
242
- # store the sum
243
- merged[stat] = values.inject(:+)
186
+ # Calls *block* with two arguments, the contig and an array
187
+ # of integer per-base coverage counts.
188
+ #
189
+ # @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
190
+ # @param block [Block] the block to call
191
+ def each_with_coverage(bam, &block)
192
+ logger.debug 'enumerating assembly with coverage'
193
+ # generate coverage with samtools
194
+ covfile = Samtools.coverage bam
195
+ # get an assembly enumerator
196
+ assembly_enum = @assembly.to_enum
197
+ contig = assembly_enum.next
198
+ # precreate an array of the correct size to contain
199
+ # coverage. this is necessary because samtools mpileup
200
+ # doesn't print a result line for bases with 0 coverage
201
+ contig.coverage = Array.new(contig.length, 0)
202
+ # the columns we need
203
+ name_i, pos_i, cov_i = 0, 1, 3
204
+ # parse the coverage file
205
+ File.open(covfile).each_line do |line|
206
+ cols = line.chomp.split("\t")
207
+ unless (cols && cols.length > 4)
208
+ # last line
209
+ break
244
210
  end
211
+ # extract the columns
212
+ name, pos, cov = cols[name_i], cols[pos_i].to_i, cols[cov_i].to_i
213
+ unless contig.name == name
214
+ while contig.name != name
215
+ begin
216
+ block.call(contig, contig.coverage)
217
+ contig = assembly_enum.next
218
+ contig.coverage = Array.new(contig.length, 0)
219
+ rescue StopIteration => stop_error
220
+ logger.error 'reached the end of assembly enumerator while ' +
221
+ 'there were contigs left in the coverage results'
222
+ logger.error "final assembly contig: #{@assembly.last.name}"
223
+ logger.error "coverage contig: #{name}"
224
+ raise stop_error
225
+ end
226
+ end
227
+ end
228
+ contig.coverage[pos - 1] = cov
245
229
  end
246
-
247
- merged
248
-
249
- end # merge_basic_stats
250
-
251
- inline do |builder|
252
-
253
- builder.c <<SRC
254
- static
255
- void
256
- longest_orf(VALUE _s) {
257
- int i,sl,longest=0;
258
- int len[6];
259
- char * c_str;
260
-
261
- sl = RSTRING_LEN(_s);
262
- c_str = StringValueCStr(_s);
263
- for (i=0;i<6;i++) {
264
- len[i]=0;
265
- }
266
- for (i=0;i<sl-2;i++) {
267
- if (c_str[i]=='T' &&
268
- ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
269
- (c_str[i+1]=='A' && c_str[i+2]=='A') ||
270
- (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
271
- if (len[i%3] > longest) {
272
- longest = len[i%3];
273
- }
274
- len[i%3]=0;
275
- } else {
276
- len[i%3]++;
277
- }
278
- if (c_str[i+2]=='A' &&
279
- ((c_str[i]=='C' && c_str[i+1]=='T') ||
280
- (c_str[i]=='T' && c_str[i+1]=='T') ||
281
- (c_str[i]=='T' && c_str[i+1]=='C'))) {
282
- if (len[3+i%3] > longest) {
283
- longest = len[3+i%3];
284
- }
285
- len[3+i%3]=0;
286
- } else {
287
- len[3+i%3]++;
288
- }
289
- }
290
- if (len[i%3] > longest) {
291
- longest = len[i%3];
292
- }
293
- if (len[3+i%3] > longest) {
294
- longest = len[3+i%3];
295
- }
296
- return INT2NUM(longest);
297
- }
298
- SRC
299
- end
300
-
301
- # finds longest orf in a sequence
302
- def orf_length sequence
303
- longest = longest_orf(sequence)
304
- return longest
305
- end
306
-
307
- # return the number of bases in the assembly, calculating
308
- # from the assembly if it hasn't already been done.
309
- def n_bases
310
- unless @n_bases
311
- @n_bases = 0
312
- @assembly.each { |s| @n_bases += s.length }
313
- end
314
- @n_bases
315
- end
316
-
317
- def print_stats
318
- self.basic_stats.map do |k, v|
319
- "#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
320
- end.join('\n')
230
+ # yield the final contig
231
+ block.call(contig, contig.coverage)
321
232
  end
322
233
 
323
234
  end # Assembly