transrate 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +16 -1
- data/.travis.yml +8 -0
- data/README.md +45 -43
- data/Rakefile +36 -0
- data/bin/transrate +98 -50
- data/deps/deps.yaml +55 -0
- data/lib/transrate.rb +19 -4
- data/lib/transrate/assembly.rb +93 -182
- data/lib/transrate/bowtie2.rb +37 -13
- data/lib/transrate/cmd.rb +19 -0
- data/lib/transrate/comparative_metrics.rb +239 -19
- data/lib/transrate/contig.rb +212 -0
- data/lib/transrate/contig_metrics.rb +76 -0
- data/lib/transrate/read_metrics.rb +83 -41
- data/lib/transrate/samtools.rb +73 -0
- data/lib/transrate/transrater.rb +31 -11
- data/lib/transrate/version.rb +1 -1
- data/test/data/150uncovered.l.fq +892 -0
- data/test/data/150uncovered.r.fq +892 -0
- data/test/data/Os.protein.2.fa +95 -0
- data/test/data/Os.protein.fa +199 -0
- data/test/data/assembly.2.fa +26 -0
- data/test/{assembly.fasta → data/assembly.fasta} +0 -0
- data/test/data/bridging_reads.l.fastq +20 -0
- data/test/data/bridging_reads.r.fastq +20 -0
- data/test/data/sorghum_transcript.fa +4 -0
- data/test/data/tiny.sam +4 -0
- data/test/helper.rb +33 -2
- data/test/test_bowtie.rb +54 -0
- data/test/test_cmd.rb +15 -0
- data/test/test_comp_metrics.rb +177 -0
- data/test/test_contig.rb +61 -0
- data/test/test_contig_metrics.rb +50 -0
- data/test/test_inline.rb +10 -9
- data/test/test_read_metrics.rb +68 -0
- data/test/test_samtools.rb +22 -0
- data/test/test_transrate.rb +40 -0
- data/test/test_transrater.rb +68 -0
- data/transrate.gemspec +16 -10
- metadata +232 -57
- data/lib/transrate/express.rb +0 -37
- data/lib/transrate/log.rb +0 -16
- data/lib/transrate/rb_hit.rb +0 -33
- data/lib/transrate/reciprocal_annotation.rb +0 -105
- data/lib/transrate/usearch.rb +0 -66
- data/test/test_test.rb +0 -41
data/deps/deps.yaml
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
blastplus:
|
2
|
+
binaries:
|
3
|
+
- makeblastdb
|
4
|
+
- blastn
|
5
|
+
- tblastn
|
6
|
+
- blastp
|
7
|
+
- blastx
|
8
|
+
- tblastx
|
9
|
+
- makembindex
|
10
|
+
- psiblast
|
11
|
+
- rpsblast
|
12
|
+
- blastdbcmd
|
13
|
+
- segmasker
|
14
|
+
- dustmasker
|
15
|
+
- blast_formatter
|
16
|
+
- windowmasker
|
17
|
+
- blastdb_aliastool
|
18
|
+
- deltablast
|
19
|
+
- rpstblastn
|
20
|
+
- blastdbcheck
|
21
|
+
version:
|
22
|
+
number: '2.2.29'
|
23
|
+
command: 'blastx -version'
|
24
|
+
url:
|
25
|
+
64bit:
|
26
|
+
macosx: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-universal-macosx.tar.gz
|
27
|
+
linux: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-x64-linux.tar.gz
|
28
|
+
bowtie2:
|
29
|
+
binaries:
|
30
|
+
- bowtie2
|
31
|
+
- bowtie2-align-l
|
32
|
+
- bowtie2-align-s
|
33
|
+
- bowtie2-build
|
34
|
+
- bowtie2-build-l
|
35
|
+
- bowtie2-build-s
|
36
|
+
- bowtie2-inspect
|
37
|
+
- bowtie2-inspect-l
|
38
|
+
- bowtie2-inspect-s
|
39
|
+
version:
|
40
|
+
number: '2.2.3'
|
41
|
+
command: 'bowtie2 --version'
|
42
|
+
url:
|
43
|
+
64bit:
|
44
|
+
linux: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-linux-x86_64.zip
|
45
|
+
macosx: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-macos-x86_64.zip
|
46
|
+
express:
|
47
|
+
binaries:
|
48
|
+
- express
|
49
|
+
version:
|
50
|
+
number: '1.5.1'
|
51
|
+
command: 'express --version'
|
52
|
+
url:
|
53
|
+
64bit:
|
54
|
+
linux: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-linux_x86_64.tgz
|
55
|
+
macosx: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-macosx_x86_64.tgz
|
data/lib/transrate.rb
CHANGED
@@ -1,18 +1,33 @@
|
|
1
|
+
# before the
|
2
|
+
require 'rbconfig'
|
3
|
+
require 'yell'
|
4
|
+
RbConfig::CONFIG['CFLAGS'] = ''
|
5
|
+
|
1
6
|
require 'transrate/transrater'
|
2
7
|
require 'transrate/version'
|
8
|
+
require 'transrate/contig'
|
3
9
|
require 'transrate/assembly'
|
4
10
|
require 'transrate/bowtie2'
|
5
11
|
require 'transrate/read_metrics'
|
6
|
-
require 'transrate/usearch'
|
7
|
-
require 'transrate/rb_hit'
|
8
|
-
require 'transrate/reciprocal_annotation'
|
9
12
|
require 'transrate/comparative_metrics'
|
13
|
+
require 'transrate/contig_metrics'
|
10
14
|
require 'transrate/metric'
|
11
15
|
require 'transrate/dimension_reduce'
|
12
|
-
require 'transrate/
|
16
|
+
require 'transrate/samtools'
|
17
|
+
require 'transrate/cmd'
|
13
18
|
|
14
19
|
# Transrate is a comprehensive transcriptome assembly
|
15
20
|
# quality assessment tool.
|
16
21
|
module Transrate
|
17
22
|
|
23
|
+
# Create the universal logger and include it in Object
|
24
|
+
# making the logger object available everywhere
|
25
|
+
Yell.new(:format => "[%5L]: %m") do |l|
|
26
|
+
l.level = :info
|
27
|
+
l.name = Object
|
28
|
+
l.adapter STDOUT, level: [:debug, :info, :warn]
|
29
|
+
l.adapter STDERR, level: [:error, :fatal]
|
30
|
+
end
|
31
|
+
Object.send :include, Yell::Loggable
|
32
|
+
|
18
33
|
end # Transrate
|
data/lib/transrate/assembly.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
require 'bio'
|
2
|
-
require 'bettersam'
|
3
2
|
require 'csv'
|
4
3
|
require 'forwardable'
|
5
|
-
require 'inline'
|
6
4
|
|
7
5
|
module Transrate
|
8
6
|
|
@@ -30,36 +28,28 @@ module Transrate
|
|
30
28
|
extend Forwardable
|
31
29
|
def_delegators :@assembly, :each, :<<, :size, :length
|
32
30
|
|
33
|
-
attr_accessor :
|
34
|
-
attr_accessor :orfs_ublast_db
|
31
|
+
attr_accessor :file
|
35
32
|
attr_reader :assembly
|
36
33
|
attr_reader :has_run
|
37
|
-
|
38
|
-
attr_accessor :file
|
34
|
+
attr_accessor :n_bases
|
39
35
|
attr_reader :n50
|
36
|
+
attr_accessor :contig_metrics
|
40
37
|
|
41
38
|
# Create a new Assembly.
|
42
39
|
#
|
43
40
|
# @param file [String] path to the assembly FASTA file
|
44
41
|
def initialize file
|
45
|
-
@file = file
|
42
|
+
@file = File.expand_path file
|
43
|
+
unless File.exist? @file
|
44
|
+
raise IOError.new "Assembly file doesn't exist: #{@file}"
|
45
|
+
end
|
46
46
|
@assembly = []
|
47
47
|
@n_bases = 0
|
48
48
|
Bio::FastaFormat.open(file).each do |entry|
|
49
49
|
@n_bases += entry.length
|
50
|
-
@assembly << entry
|
50
|
+
@assembly << Contig.new(entry)
|
51
51
|
end
|
52
|
-
|
53
|
-
|
54
|
-
# Return basic statistics about the assembly in
|
55
|
-
# the specified FASTA file
|
56
|
-
#
|
57
|
-
# @param file [String] path to assembly FASTA file
|
58
|
-
#
|
59
|
-
# @return [Hash] basic statistics about the assembly
|
60
|
-
def self.stats_from_fasta file
|
61
|
-
a = Assembly.new file
|
62
|
-
a.basic_stats
|
52
|
+
@contig_metrics = ContigMetrics.new self
|
63
53
|
end
|
64
54
|
|
65
55
|
# Generate and store the basic statistics for this assembly
|
@@ -74,6 +64,7 @@ module Transrate
|
|
74
64
|
singleton_class.class_eval { attr_accessor attr_ivar }
|
75
65
|
self.instance_variable_set(ivar, value)
|
76
66
|
end
|
67
|
+
@contig_metrics.run
|
77
68
|
@has_run = true
|
78
69
|
end
|
79
70
|
|
@@ -85,60 +76,14 @@ module Transrate
|
|
85
76
|
# @param threads [Integer] number of threads to use
|
86
77
|
#
|
87
78
|
# @return [Hash] basic statistics about the assembly
|
88
|
-
def basic_stats threads=
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
# create a work queue to process contigs in parallel
|
94
|
-
queue = Queue.new
|
95
|
-
|
96
|
-
# split the contigs into equal sized bins, one bin per thread
|
97
|
-
binsize = (@assembly.size / threads.to_f).ceil
|
98
|
-
@assembly.each_slice(binsize) do |bin|
|
99
|
-
queue << bin
|
100
|
-
end
|
101
|
-
|
102
|
-
# a classic threadpool - an Array of threads that allows
|
103
|
-
# us to assign work to each thread and then aggregate their
|
104
|
-
# results when they are all finished
|
105
|
-
threadpool = []
|
106
|
-
|
107
|
-
# assign one bin of contigs to each thread from the queue.
|
108
|
-
# each thread will process its bin of contigs and then wait
|
109
|
-
# for the others to finish.
|
110
|
-
semaphore = Mutex.new
|
111
|
-
stats = []
|
112
|
-
|
113
|
-
threads.times do
|
114
|
-
threadpool << Thread.new do |thread|
|
115
|
-
# keep looping until we run out of bins
|
116
|
-
until queue.empty?
|
117
|
-
|
118
|
-
# use non-blocking pop, so an exception is raised
|
119
|
-
# when the queue runs dry
|
120
|
-
bin = queue.pop(true) rescue nil
|
121
|
-
if bin
|
122
|
-
# calculate basic stats for the bin, storing them
|
123
|
-
# in the current thread so they can be collected
|
124
|
-
# in the main thread.
|
125
|
-
bin_stats = basic_bin_stats bin
|
126
|
-
semaphore.synchronize { stats << bin_stats }
|
127
|
-
end
|
128
|
-
end
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
# collect the stats calculated in each thread and join
|
133
|
-
# the threads to terminate them
|
134
|
-
threadpool.each(&:join)
|
135
|
-
|
136
|
-
# merge the collected stats and return them
|
137
|
-
merge_basic_stats stats
|
138
|
-
|
79
|
+
def basic_stats threads=1
|
80
|
+
return @basic_stats if @basic_stats
|
81
|
+
bin = @assembly.dup
|
82
|
+
@basic_stats = basic_bin_stats bin
|
83
|
+
@basic_stats
|
139
84
|
end # basic_stats
|
140
85
|
|
141
|
-
|
86
|
+
|
142
87
|
# Calculate basic statistics in a single thread for a bin
|
143
88
|
# of contigs.
|
144
89
|
#
|
@@ -156,13 +101,13 @@ module Transrate
|
|
156
101
|
#
|
157
102
|
# @param [Array] bin An array of Bio::Sequence objects
|
158
103
|
# representing contigs in the assembly
|
159
|
-
|
160
|
-
def basic_bin_stats bin
|
161
104
|
|
105
|
+
def basic_bin_stats bin
|
106
|
+
|
162
107
|
# cumulative length is a float so we can divide it
|
163
108
|
# accurately later to get the mean length
|
164
109
|
cumulative_length = 0.0
|
165
|
-
|
110
|
+
|
166
111
|
# we'll calculate Nx for x in [10, 30, 50, 70, 90]
|
167
112
|
# to do this we create a stack of the x values and
|
168
113
|
# pop the first one to set the first cutoff. when
|
@@ -174,23 +119,30 @@ module Transrate
|
|
174
119
|
x2 = x.clone
|
175
120
|
cutoff = x2.pop / 100.0
|
176
121
|
res = []
|
177
|
-
|
178
|
-
n10k = 0
|
179
|
-
orf_length_sum = 0
|
180
|
-
|
122
|
+
n_under_200, n_over_1k, n_over_10k, n_with_orf, orf_length_sum = 0,0,0,0,0
|
181
123
|
# sort the contigs in ascending length order
|
182
124
|
# and iterate over them
|
183
|
-
bin.sort_by! { |c| c.seq.
|
125
|
+
bin.sort_by! { |c| c.seq.length }
|
184
126
|
bin.each do |contig|
|
185
|
-
|
127
|
+
|
186
128
|
# increment our long contig counters if this
|
187
129
|
# contig is above the thresholds
|
188
|
-
|
189
|
-
|
130
|
+
if contig.length < 200
|
131
|
+
# ignore contigs less than 200 bases,
|
132
|
+
# but record how many there are
|
133
|
+
n_under_200 += 1
|
134
|
+
next
|
135
|
+
end
|
136
|
+
n_over_1k += 1 if contig.length > 1_000
|
137
|
+
n_over_10k += 1 if contig.length > 10_000
|
190
138
|
|
191
139
|
# add the length of the longest orf to the
|
192
140
|
# running total
|
193
|
-
|
141
|
+
orf_length = contig.orf_length
|
142
|
+
orf_length_sum += orf_length
|
143
|
+
# only consider orfs that are realistic length
|
144
|
+
# (here we set minimum amino acid length as 50)
|
145
|
+
n_with_orf += 1 if orf_length > 149
|
194
146
|
|
195
147
|
# increment the cumulative length and check whether the Nx
|
196
148
|
# cutoff has been reached. if it has, store the Nx value and
|
@@ -199,125 +151,84 @@ module Transrate
|
|
199
151
|
if cumulative_length >= @n_bases * cutoff
|
200
152
|
res << contig.length
|
201
153
|
if x2.empty?
|
202
|
-
cutoff=1
|
154
|
+
cutoff = 1
|
203
155
|
else
|
204
156
|
cutoff = x2.pop / 100.0
|
205
|
-
end
|
157
|
+
end
|
206
158
|
end
|
207
159
|
|
208
160
|
end
|
209
161
|
|
162
|
+
# if there aren't enough sequences we might have no value for some
|
163
|
+
# of the Nx. Fill the empty ones in with the longest contig length.
|
164
|
+
while res.length < x.length do
|
165
|
+
res << bin.last.length
|
166
|
+
end
|
167
|
+
|
210
168
|
# calculate and return the statistics as a hash
|
211
169
|
mean = cumulative_length / @assembly.size
|
212
|
-
ns = Hash[x.map { |n| "
|
170
|
+
ns = Hash[x.map { |n| "n#{n}" }.zip(res)]
|
213
171
|
{
|
214
172
|
'n_seqs' => bin.size,
|
215
173
|
'smallest' => bin.first.length,
|
216
174
|
'largest' => bin.last.length,
|
217
175
|
'n_bases' => n_bases,
|
218
176
|
'mean_len' => mean,
|
219
|
-
'
|
220
|
-
'
|
221
|
-
'
|
177
|
+
'n_under_200' => n_under_200,
|
178
|
+
'n_over_1k' => n_over_1k,
|
179
|
+
'n_over_10k' => n_over_10k,
|
180
|
+
'n_with_orf' => n_with_orf,
|
181
|
+
'mean_orf_percent' => 300 * orf_length_sum / (@assembly.size * mean)
|
222
182
|
}.merge ns
|
223
183
|
|
224
184
|
end # basic_bin_stats
|
225
185
|
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
186
|
+
# Calls *block* with two arguments, the contig and an array
|
187
|
+
# of integer per-base coverage counts.
|
188
|
+
#
|
189
|
+
# @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
|
190
|
+
# @param block [Block] the block to call
|
191
|
+
def each_with_coverage(bam, &block)
|
192
|
+
logger.debug 'enumerating assembly with coverage'
|
193
|
+
# generate coverage with samtools
|
194
|
+
covfile = Samtools.coverage bam
|
195
|
+
# get an assembly enumerator
|
196
|
+
assembly_enum = @assembly.to_enum
|
197
|
+
contig = assembly_enum.next
|
198
|
+
# precreate an array of the correct size to contain
|
199
|
+
# coverage. this is necessary because samtools mpileup
|
200
|
+
# doesn't print a result line for bases with 0 coverage
|
201
|
+
contig.coverage = Array.new(contig.length, 0)
|
202
|
+
# the columns we need
|
203
|
+
name_i, pos_i, cov_i = 0, 1, 3
|
204
|
+
# parse the coverage file
|
205
|
+
File.open(covfile).each_line do |line|
|
206
|
+
cols = line.chomp.split("\t")
|
207
|
+
unless (cols && cols.length > 4)
|
208
|
+
# last line
|
209
|
+
break
|
244
210
|
end
|
211
|
+
# extract the columns
|
212
|
+
name, pos, cov = cols[name_i], cols[pos_i].to_i, cols[cov_i].to_i
|
213
|
+
unless contig.name == name
|
214
|
+
while contig.name != name
|
215
|
+
begin
|
216
|
+
block.call(contig, contig.coverage)
|
217
|
+
contig = assembly_enum.next
|
218
|
+
contig.coverage = Array.new(contig.length, 0)
|
219
|
+
rescue StopIteration => stop_error
|
220
|
+
logger.error 'reached the end of assembly enumerator while ' +
|
221
|
+
'there were contigs left in the coverage results'
|
222
|
+
logger.error "final assembly contig: #{@assembly.last.name}"
|
223
|
+
logger.error "coverage contig: #{name}"
|
224
|
+
raise stop_error
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
contig.coverage[pos - 1] = cov
|
245
229
|
end
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
end # merge_basic_stats
|
250
|
-
|
251
|
-
inline do |builder|
|
252
|
-
|
253
|
-
builder.c <<SRC
|
254
|
-
static
|
255
|
-
void
|
256
|
-
longest_orf(VALUE _s) {
|
257
|
-
int i,sl,longest=0;
|
258
|
-
int len[6];
|
259
|
-
char * c_str;
|
260
|
-
|
261
|
-
sl = RSTRING_LEN(_s);
|
262
|
-
c_str = StringValueCStr(_s);
|
263
|
-
for (i=0;i<6;i++) {
|
264
|
-
len[i]=0;
|
265
|
-
}
|
266
|
-
for (i=0;i<sl-2;i++) {
|
267
|
-
if (c_str[i]=='T' &&
|
268
|
-
((c_str[i+1]=='A' && c_str[i+2]=='G') ||
|
269
|
-
(c_str[i+1]=='A' && c_str[i+2]=='A') ||
|
270
|
-
(c_str[i+1]=='G' && c_str[i+2]=='A'))) {
|
271
|
-
if (len[i%3] > longest) {
|
272
|
-
longest = len[i%3];
|
273
|
-
}
|
274
|
-
len[i%3]=0;
|
275
|
-
} else {
|
276
|
-
len[i%3]++;
|
277
|
-
}
|
278
|
-
if (c_str[i+2]=='A' &&
|
279
|
-
((c_str[i]=='C' && c_str[i+1]=='T') ||
|
280
|
-
(c_str[i]=='T' && c_str[i+1]=='T') ||
|
281
|
-
(c_str[i]=='T' && c_str[i+1]=='C'))) {
|
282
|
-
if (len[3+i%3] > longest) {
|
283
|
-
longest = len[3+i%3];
|
284
|
-
}
|
285
|
-
len[3+i%3]=0;
|
286
|
-
} else {
|
287
|
-
len[3+i%3]++;
|
288
|
-
}
|
289
|
-
}
|
290
|
-
if (len[i%3] > longest) {
|
291
|
-
longest = len[i%3];
|
292
|
-
}
|
293
|
-
if (len[3+i%3] > longest) {
|
294
|
-
longest = len[3+i%3];
|
295
|
-
}
|
296
|
-
return INT2NUM(longest);
|
297
|
-
}
|
298
|
-
SRC
|
299
|
-
end
|
300
|
-
|
301
|
-
# finds longest orf in a sequence
|
302
|
-
def orf_length sequence
|
303
|
-
longest = longest_orf(sequence)
|
304
|
-
return longest
|
305
|
-
end
|
306
|
-
|
307
|
-
# return the number of bases in the assembly, calculating
|
308
|
-
# from the assembly if it hasn't already been done.
|
309
|
-
def n_bases
|
310
|
-
unless @n_bases
|
311
|
-
@n_bases = 0
|
312
|
-
@assembly.each { |s| @n_bases += s.length }
|
313
|
-
end
|
314
|
-
@n_bases
|
315
|
-
end
|
316
|
-
|
317
|
-
def print_stats
|
318
|
-
self.basic_stats.map do |k, v|
|
319
|
-
"#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
|
320
|
-
end.join('\n')
|
230
|
+
# yield the final contig
|
231
|
+
block.call(contig, contig.coverage)
|
321
232
|
end
|
322
233
|
|
323
234
|
end # Assembly
|