transrate 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +16 -1
- data/.travis.yml +8 -0
- data/README.md +45 -43
- data/Rakefile +36 -0
- data/bin/transrate +98 -50
- data/deps/deps.yaml +55 -0
- data/lib/transrate.rb +19 -4
- data/lib/transrate/assembly.rb +93 -182
- data/lib/transrate/bowtie2.rb +37 -13
- data/lib/transrate/cmd.rb +19 -0
- data/lib/transrate/comparative_metrics.rb +239 -19
- data/lib/transrate/contig.rb +212 -0
- data/lib/transrate/contig_metrics.rb +76 -0
- data/lib/transrate/read_metrics.rb +83 -41
- data/lib/transrate/samtools.rb +73 -0
- data/lib/transrate/transrater.rb +31 -11
- data/lib/transrate/version.rb +1 -1
- data/test/data/150uncovered.l.fq +892 -0
- data/test/data/150uncovered.r.fq +892 -0
- data/test/data/Os.protein.2.fa +95 -0
- data/test/data/Os.protein.fa +199 -0
- data/test/data/assembly.2.fa +26 -0
- data/test/{assembly.fasta → data/assembly.fasta} +0 -0
- data/test/data/bridging_reads.l.fastq +20 -0
- data/test/data/bridging_reads.r.fastq +20 -0
- data/test/data/sorghum_transcript.fa +4 -0
- data/test/data/tiny.sam +4 -0
- data/test/helper.rb +33 -2
- data/test/test_bowtie.rb +54 -0
- data/test/test_cmd.rb +15 -0
- data/test/test_comp_metrics.rb +177 -0
- data/test/test_contig.rb +61 -0
- data/test/test_contig_metrics.rb +50 -0
- data/test/test_inline.rb +10 -9
- data/test/test_read_metrics.rb +68 -0
- data/test/test_samtools.rb +22 -0
- data/test/test_transrate.rb +40 -0
- data/test/test_transrater.rb +68 -0
- data/transrate.gemspec +16 -10
- metadata +232 -57
- data/lib/transrate/express.rb +0 -37
- data/lib/transrate/log.rb +0 -16
- data/lib/transrate/rb_hit.rb +0 -33
- data/lib/transrate/reciprocal_annotation.rb +0 -105
- data/lib/transrate/usearch.rb +0 -66
- data/test/test_test.rb +0 -41
data/deps/deps.yaml
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
blastplus:
|
2
|
+
binaries:
|
3
|
+
- makeblastdb
|
4
|
+
- blastn
|
5
|
+
- tblastn
|
6
|
+
- blastp
|
7
|
+
- blastx
|
8
|
+
- tblastx
|
9
|
+
- makembindex
|
10
|
+
- psiblast
|
11
|
+
- rpsblast
|
12
|
+
- blastdbcmd
|
13
|
+
- segmasker
|
14
|
+
- dustmasker
|
15
|
+
- blast_formatter
|
16
|
+
- windowmasker
|
17
|
+
- blastdb_aliastool
|
18
|
+
- deltablast
|
19
|
+
- rpstblastn
|
20
|
+
- blastdbcheck
|
21
|
+
version:
|
22
|
+
number: '2.2.29'
|
23
|
+
command: 'blastx -version'
|
24
|
+
url:
|
25
|
+
64bit:
|
26
|
+
macosx: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-universal-macosx.tar.gz
|
27
|
+
linux: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-x64-linux.tar.gz
|
28
|
+
bowtie2:
|
29
|
+
binaries:
|
30
|
+
- bowtie2
|
31
|
+
- bowtie2-align-l
|
32
|
+
- bowtie2-align-s
|
33
|
+
- bowtie2-build
|
34
|
+
- bowtie2-build-l
|
35
|
+
- bowtie2-build-s
|
36
|
+
- bowtie2-inspect
|
37
|
+
- bowtie2-inspect-l
|
38
|
+
- bowtie2-inspect-s
|
39
|
+
version:
|
40
|
+
number: '2.2.3'
|
41
|
+
command: 'bowtie2 --version'
|
42
|
+
url:
|
43
|
+
64bit:
|
44
|
+
linux: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-linux-x86_64.zip
|
45
|
+
macosx: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-macos-x86_64.zip
|
46
|
+
express:
|
47
|
+
binaries:
|
48
|
+
- express
|
49
|
+
version:
|
50
|
+
number: '1.5.1'
|
51
|
+
command: 'express --version'
|
52
|
+
url:
|
53
|
+
64bit:
|
54
|
+
linux: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-linux_x86_64.tgz
|
55
|
+
macosx: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-macosx_x86_64.tgz
|
data/lib/transrate.rb
CHANGED
@@ -1,18 +1,33 @@
|
|
1
|
+
# before the
|
2
|
+
require 'rbconfig'
|
3
|
+
require 'yell'
|
4
|
+
RbConfig::CONFIG['CFLAGS'] = ''
|
5
|
+
|
1
6
|
require 'transrate/transrater'
|
2
7
|
require 'transrate/version'
|
8
|
+
require 'transrate/contig'
|
3
9
|
require 'transrate/assembly'
|
4
10
|
require 'transrate/bowtie2'
|
5
11
|
require 'transrate/read_metrics'
|
6
|
-
require 'transrate/usearch'
|
7
|
-
require 'transrate/rb_hit'
|
8
|
-
require 'transrate/reciprocal_annotation'
|
9
12
|
require 'transrate/comparative_metrics'
|
13
|
+
require 'transrate/contig_metrics'
|
10
14
|
require 'transrate/metric'
|
11
15
|
require 'transrate/dimension_reduce'
|
12
|
-
require 'transrate/
|
16
|
+
require 'transrate/samtools'
|
17
|
+
require 'transrate/cmd'
|
13
18
|
|
14
19
|
# Transrate is a comprehensive transcriptome assembly
|
15
20
|
# quality assessment tool.
|
16
21
|
module Transrate
|
17
22
|
|
23
|
+
# Create the universal logger and include it in Object
|
24
|
+
# making the logger object available everywhere
|
25
|
+
Yell.new(:format => "[%5L]: %m") do |l|
|
26
|
+
l.level = :info
|
27
|
+
l.name = Object
|
28
|
+
l.adapter STDOUT, level: [:debug, :info, :warn]
|
29
|
+
l.adapter STDERR, level: [:error, :fatal]
|
30
|
+
end
|
31
|
+
Object.send :include, Yell::Loggable
|
32
|
+
|
18
33
|
end # Transrate
|
data/lib/transrate/assembly.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
require 'bio'
|
2
|
-
require 'bettersam'
|
3
2
|
require 'csv'
|
4
3
|
require 'forwardable'
|
5
|
-
require 'inline'
|
6
4
|
|
7
5
|
module Transrate
|
8
6
|
|
@@ -30,36 +28,28 @@ module Transrate
|
|
30
28
|
extend Forwardable
|
31
29
|
def_delegators :@assembly, :each, :<<, :size, :length
|
32
30
|
|
33
|
-
attr_accessor :
|
34
|
-
attr_accessor :orfs_ublast_db
|
31
|
+
attr_accessor :file
|
35
32
|
attr_reader :assembly
|
36
33
|
attr_reader :has_run
|
37
|
-
|
38
|
-
attr_accessor :file
|
34
|
+
attr_accessor :n_bases
|
39
35
|
attr_reader :n50
|
36
|
+
attr_accessor :contig_metrics
|
40
37
|
|
41
38
|
# Create a new Assembly.
|
42
39
|
#
|
43
40
|
# @param file [String] path to the assembly FASTA file
|
44
41
|
def initialize file
|
45
|
-
@file = file
|
42
|
+
@file = File.expand_path file
|
43
|
+
unless File.exist? @file
|
44
|
+
raise IOError.new "Assembly file doesn't exist: #{@file}"
|
45
|
+
end
|
46
46
|
@assembly = []
|
47
47
|
@n_bases = 0
|
48
48
|
Bio::FastaFormat.open(file).each do |entry|
|
49
49
|
@n_bases += entry.length
|
50
|
-
@assembly << entry
|
50
|
+
@assembly << Contig.new(entry)
|
51
51
|
end
|
52
|
-
|
53
|
-
|
54
|
-
# Return basic statistics about the assembly in
|
55
|
-
# the specified FASTA file
|
56
|
-
#
|
57
|
-
# @param file [String] path to assembly FASTA file
|
58
|
-
#
|
59
|
-
# @return [Hash] basic statistics about the assembly
|
60
|
-
def self.stats_from_fasta file
|
61
|
-
a = Assembly.new file
|
62
|
-
a.basic_stats
|
52
|
+
@contig_metrics = ContigMetrics.new self
|
63
53
|
end
|
64
54
|
|
65
55
|
# Generate and store the basic statistics for this assembly
|
@@ -74,6 +64,7 @@ module Transrate
|
|
74
64
|
singleton_class.class_eval { attr_accessor attr_ivar }
|
75
65
|
self.instance_variable_set(ivar, value)
|
76
66
|
end
|
67
|
+
@contig_metrics.run
|
77
68
|
@has_run = true
|
78
69
|
end
|
79
70
|
|
@@ -85,60 +76,14 @@ module Transrate
|
|
85
76
|
# @param threads [Integer] number of threads to use
|
86
77
|
#
|
87
78
|
# @return [Hash] basic statistics about the assembly
|
88
|
-
def basic_stats threads=
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
# create a work queue to process contigs in parallel
|
94
|
-
queue = Queue.new
|
95
|
-
|
96
|
-
# split the contigs into equal sized bins, one bin per thread
|
97
|
-
binsize = (@assembly.size / threads.to_f).ceil
|
98
|
-
@assembly.each_slice(binsize) do |bin|
|
99
|
-
queue << bin
|
100
|
-
end
|
101
|
-
|
102
|
-
# a classic threadpool - an Array of threads that allows
|
103
|
-
# us to assign work to each thread and then aggregate their
|
104
|
-
# results when they are all finished
|
105
|
-
threadpool = []
|
106
|
-
|
107
|
-
# assign one bin of contigs to each thread from the queue.
|
108
|
-
# each thread will process its bin of contigs and then wait
|
109
|
-
# for the others to finish.
|
110
|
-
semaphore = Mutex.new
|
111
|
-
stats = []
|
112
|
-
|
113
|
-
threads.times do
|
114
|
-
threadpool << Thread.new do |thread|
|
115
|
-
# keep looping until we run out of bins
|
116
|
-
until queue.empty?
|
117
|
-
|
118
|
-
# use non-blocking pop, so an exception is raised
|
119
|
-
# when the queue runs dry
|
120
|
-
bin = queue.pop(true) rescue nil
|
121
|
-
if bin
|
122
|
-
# calculate basic stats for the bin, storing them
|
123
|
-
# in the current thread so they can be collected
|
124
|
-
# in the main thread.
|
125
|
-
bin_stats = basic_bin_stats bin
|
126
|
-
semaphore.synchronize { stats << bin_stats }
|
127
|
-
end
|
128
|
-
end
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
# collect the stats calculated in each thread and join
|
133
|
-
# the threads to terminate them
|
134
|
-
threadpool.each(&:join)
|
135
|
-
|
136
|
-
# merge the collected stats and return them
|
137
|
-
merge_basic_stats stats
|
138
|
-
|
79
|
+
def basic_stats threads=1
|
80
|
+
return @basic_stats if @basic_stats
|
81
|
+
bin = @assembly.dup
|
82
|
+
@basic_stats = basic_bin_stats bin
|
83
|
+
@basic_stats
|
139
84
|
end # basic_stats
|
140
85
|
|
141
|
-
|
86
|
+
|
142
87
|
# Calculate basic statistics in a single thread for a bin
|
143
88
|
# of contigs.
|
144
89
|
#
|
@@ -156,13 +101,13 @@ module Transrate
|
|
156
101
|
#
|
157
102
|
# @param [Array] bin An array of Bio::Sequence objects
|
158
103
|
# representing contigs in the assembly
|
159
|
-
|
160
|
-
def basic_bin_stats bin
|
161
104
|
|
105
|
+
def basic_bin_stats bin
|
106
|
+
|
162
107
|
# cumulative length is a float so we can divide it
|
163
108
|
# accurately later to get the mean length
|
164
109
|
cumulative_length = 0.0
|
165
|
-
|
110
|
+
|
166
111
|
# we'll calculate Nx for x in [10, 30, 50, 70, 90]
|
167
112
|
# to do this we create a stack of the x values and
|
168
113
|
# pop the first one to set the first cutoff. when
|
@@ -174,23 +119,30 @@ module Transrate
|
|
174
119
|
x2 = x.clone
|
175
120
|
cutoff = x2.pop / 100.0
|
176
121
|
res = []
|
177
|
-
|
178
|
-
n10k = 0
|
179
|
-
orf_length_sum = 0
|
180
|
-
|
122
|
+
n_under_200, n_over_1k, n_over_10k, n_with_orf, orf_length_sum = 0,0,0,0,0
|
181
123
|
# sort the contigs in ascending length order
|
182
124
|
# and iterate over them
|
183
|
-
bin.sort_by! { |c| c.seq.
|
125
|
+
bin.sort_by! { |c| c.seq.length }
|
184
126
|
bin.each do |contig|
|
185
|
-
|
127
|
+
|
186
128
|
# increment our long contig counters if this
|
187
129
|
# contig is above the thresholds
|
188
|
-
|
189
|
-
|
130
|
+
if contig.length < 200
|
131
|
+
# ignore contigs less than 200 bases,
|
132
|
+
# but record how many there are
|
133
|
+
n_under_200 += 1
|
134
|
+
next
|
135
|
+
end
|
136
|
+
n_over_1k += 1 if contig.length > 1_000
|
137
|
+
n_over_10k += 1 if contig.length > 10_000
|
190
138
|
|
191
139
|
# add the length of the longest orf to the
|
192
140
|
# running total
|
193
|
-
|
141
|
+
orf_length = contig.orf_length
|
142
|
+
orf_length_sum += orf_length
|
143
|
+
# only consider orfs that are realistic length
|
144
|
+
# (here we set minimum amino acid length as 50)
|
145
|
+
n_with_orf += 1 if orf_length > 149
|
194
146
|
|
195
147
|
# increment the cumulative length and check whether the Nx
|
196
148
|
# cutoff has been reached. if it has, store the Nx value and
|
@@ -199,125 +151,84 @@ module Transrate
|
|
199
151
|
if cumulative_length >= @n_bases * cutoff
|
200
152
|
res << contig.length
|
201
153
|
if x2.empty?
|
202
|
-
cutoff=1
|
154
|
+
cutoff = 1
|
203
155
|
else
|
204
156
|
cutoff = x2.pop / 100.0
|
205
|
-
end
|
157
|
+
end
|
206
158
|
end
|
207
159
|
|
208
160
|
end
|
209
161
|
|
162
|
+
# if there aren't enough sequences we might have no value for some
|
163
|
+
# of the Nx. Fill the empty ones in with the longest contig length.
|
164
|
+
while res.length < x.length do
|
165
|
+
res << bin.last.length
|
166
|
+
end
|
167
|
+
|
210
168
|
# calculate and return the statistics as a hash
|
211
169
|
mean = cumulative_length / @assembly.size
|
212
|
-
ns = Hash[x.map { |n| "
|
170
|
+
ns = Hash[x.map { |n| "n#{n}" }.zip(res)]
|
213
171
|
{
|
214
172
|
'n_seqs' => bin.size,
|
215
173
|
'smallest' => bin.first.length,
|
216
174
|
'largest' => bin.last.length,
|
217
175
|
'n_bases' => n_bases,
|
218
176
|
'mean_len' => mean,
|
219
|
-
'
|
220
|
-
'
|
221
|
-
'
|
177
|
+
'n_under_200' => n_under_200,
|
178
|
+
'n_over_1k' => n_over_1k,
|
179
|
+
'n_over_10k' => n_over_10k,
|
180
|
+
'n_with_orf' => n_with_orf,
|
181
|
+
'mean_orf_percent' => 300 * orf_length_sum / (@assembly.size * mean)
|
222
182
|
}.merge ns
|
223
183
|
|
224
184
|
end # basic_bin_stats
|
225
185
|
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
186
|
+
# Calls *block* with two arguments, the contig and an array
|
187
|
+
# of integer per-base coverage counts.
|
188
|
+
#
|
189
|
+
# @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
|
190
|
+
# @param block [Block] the block to call
|
191
|
+
def each_with_coverage(bam, &block)
|
192
|
+
logger.debug 'enumerating assembly with coverage'
|
193
|
+
# generate coverage with samtools
|
194
|
+
covfile = Samtools.coverage bam
|
195
|
+
# get an assembly enumerator
|
196
|
+
assembly_enum = @assembly.to_enum
|
197
|
+
contig = assembly_enum.next
|
198
|
+
# precreate an array of the correct size to contain
|
199
|
+
# coverage. this is necessary because samtools mpileup
|
200
|
+
# doesn't print a result line for bases with 0 coverage
|
201
|
+
contig.coverage = Array.new(contig.length, 0)
|
202
|
+
# the columns we need
|
203
|
+
name_i, pos_i, cov_i = 0, 1, 3
|
204
|
+
# parse the coverage file
|
205
|
+
File.open(covfile).each_line do |line|
|
206
|
+
cols = line.chomp.split("\t")
|
207
|
+
unless (cols && cols.length > 4)
|
208
|
+
# last line
|
209
|
+
break
|
244
210
|
end
|
211
|
+
# extract the columns
|
212
|
+
name, pos, cov = cols[name_i], cols[pos_i].to_i, cols[cov_i].to_i
|
213
|
+
unless contig.name == name
|
214
|
+
while contig.name != name
|
215
|
+
begin
|
216
|
+
block.call(contig, contig.coverage)
|
217
|
+
contig = assembly_enum.next
|
218
|
+
contig.coverage = Array.new(contig.length, 0)
|
219
|
+
rescue StopIteration => stop_error
|
220
|
+
logger.error 'reached the end of assembly enumerator while ' +
|
221
|
+
'there were contigs left in the coverage results'
|
222
|
+
logger.error "final assembly contig: #{@assembly.last.name}"
|
223
|
+
logger.error "coverage contig: #{name}"
|
224
|
+
raise stop_error
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
contig.coverage[pos - 1] = cov
|
245
229
|
end
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
end # merge_basic_stats
|
250
|
-
|
251
|
-
inline do |builder|
|
252
|
-
|
253
|
-
builder.c <<SRC
|
254
|
-
static
|
255
|
-
void
|
256
|
-
longest_orf(VALUE _s) {
|
257
|
-
int i,sl,longest=0;
|
258
|
-
int len[6];
|
259
|
-
char * c_str;
|
260
|
-
|
261
|
-
sl = RSTRING_LEN(_s);
|
262
|
-
c_str = StringValueCStr(_s);
|
263
|
-
for (i=0;i<6;i++) {
|
264
|
-
len[i]=0;
|
265
|
-
}
|
266
|
-
for (i=0;i<sl-2;i++) {
|
267
|
-
if (c_str[i]=='T' &&
|
268
|
-
((c_str[i+1]=='A' && c_str[i+2]=='G') ||
|
269
|
-
(c_str[i+1]=='A' && c_str[i+2]=='A') ||
|
270
|
-
(c_str[i+1]=='G' && c_str[i+2]=='A'))) {
|
271
|
-
if (len[i%3] > longest) {
|
272
|
-
longest = len[i%3];
|
273
|
-
}
|
274
|
-
len[i%3]=0;
|
275
|
-
} else {
|
276
|
-
len[i%3]++;
|
277
|
-
}
|
278
|
-
if (c_str[i+2]=='A' &&
|
279
|
-
((c_str[i]=='C' && c_str[i+1]=='T') ||
|
280
|
-
(c_str[i]=='T' && c_str[i+1]=='T') ||
|
281
|
-
(c_str[i]=='T' && c_str[i+1]=='C'))) {
|
282
|
-
if (len[3+i%3] > longest) {
|
283
|
-
longest = len[3+i%3];
|
284
|
-
}
|
285
|
-
len[3+i%3]=0;
|
286
|
-
} else {
|
287
|
-
len[3+i%3]++;
|
288
|
-
}
|
289
|
-
}
|
290
|
-
if (len[i%3] > longest) {
|
291
|
-
longest = len[i%3];
|
292
|
-
}
|
293
|
-
if (len[3+i%3] > longest) {
|
294
|
-
longest = len[3+i%3];
|
295
|
-
}
|
296
|
-
return INT2NUM(longest);
|
297
|
-
}
|
298
|
-
SRC
|
299
|
-
end
|
300
|
-
|
301
|
-
# finds longest orf in a sequence
|
302
|
-
def orf_length sequence
|
303
|
-
longest = longest_orf(sequence)
|
304
|
-
return longest
|
305
|
-
end
|
306
|
-
|
307
|
-
# return the number of bases in the assembly, calculating
|
308
|
-
# from the assembly if it hasn't already been done.
|
309
|
-
def n_bases
|
310
|
-
unless @n_bases
|
311
|
-
@n_bases = 0
|
312
|
-
@assembly.each { |s| @n_bases += s.length }
|
313
|
-
end
|
314
|
-
@n_bases
|
315
|
-
end
|
316
|
-
|
317
|
-
def print_stats
|
318
|
-
self.basic_stats.map do |k, v|
|
319
|
-
"#{k}#{" " * (20 - (k.length + v.to_i.to_s.length))}#{v.to_i}"
|
320
|
-
end.join('\n')
|
230
|
+
# yield the final contig
|
231
|
+
block.call(contig, contig.coverage)
|
321
232
|
end
|
322
233
|
|
323
234
|
end # Assembly
|