transrate 1.0.0.beta3 → 1.0.0.beta4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/Rakefile +1 -1
- data/bin/transrate +12 -8
- data/deps/deps.yaml +6 -3
- data/lib/transrate/assembly.rb +4 -0
- data/lib/transrate/comparative_metrics.rb +97 -210
- data/lib/transrate/read_metrics.rb +11 -2
- data/lib/transrate/salmon.rb +13 -3
- data/lib/transrate/score_optimiser.rb +6 -1
- data/lib/transrate/snap.rb +1 -2
- data/lib/transrate/transrater.rb +2 -2
- data/lib/transrate/version.rb +1 -1
- data/test/data/sorghum_100.fa +1 -1
- data/test/data/test.sf +19 -19
- data/test/test_assembly.rb +2 -2
- data/test/test_bin.rb +1 -0
- data/test/test_comp_metrics.rb +333 -162
- data/test/test_optimiser.rb +2 -2
- data/test/test_read_metrics.rb +9 -9
- data/test/test_salmon.rb +13 -1
- data/test/test_transrater.rb +1 -1
- data/transrate.gemspec +1 -1
- metadata +6 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6662137942e3933a714a4950d4de53826b71aad
|
4
|
+
data.tar.gz: 2863b7bc2b0f63af4c43dc31c77f43324dafccfc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30f64bf256bb98ae6031ee635c1187af680974fa97bc719d108a3878cfacaef7216fd5c8c5e0a07be077eca55d6379e1562f425eee3948d86a1f2dc816f1cfef
|
7
|
+
data.tar.gz: fb8fbd6cc684112693fd7241860724938610efa9b8950811b4cfe7b9b5232b28a8101a1c49157fa22a0aff979029628c6141f75a271871cc0a07ffe2402b7666
|
data/README.md
CHANGED
@@ -2,8 +2,11 @@
|
|
2
2
|
<img alt="Transrate - understand your transcriptome assembly" src="https://github.com/Blahah/transrate/raw/master/docs/transrate_logo_full.png">
|
3
3
|
</p>
|
4
4
|
|
5
|
+
Download latest binaries: [![Download](https://api.bintray.com/packages/blahah/generic/transrate/images/download.svg)][bintray]
|
6
|
+
|
5
7
|
Need help? Chat to us live: [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/Blahah/transrate?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
6
8
|
|
9
|
+
|
7
10
|
## Development status
|
8
11
|
|
9
12
|
[![Gem Version](http://img.shields.io/gem/v/transrate.svg)][gem]
|
@@ -19,6 +22,7 @@ Need help? Chat to us live: [![Gitter](https://badges.gitter.im/Join Chat.svg)](
|
|
19
22
|
[gemnasium]: https://gemnasium.com/Blahah/transrate
|
20
23
|
[codeclimate]: https://codeclimate.com/github/Blahah/transrate
|
21
24
|
[coveralls]: https://coveralls.io/r/Blahah/transrate
|
25
|
+
[bintray]: https://bintray.com/blahah/generic/transrate/_latestVersion
|
22
26
|
|
23
27
|
This software is being actively developed. Please be aware that there may be bugs. If you find any, please report them on the [issue tracker](https://github.com/Blahah/transrate/issues).
|
24
28
|
|
data/Rakefile
CHANGED
data/bin/transrate
CHANGED
@@ -32,7 +32,8 @@ opts = Trollop::options do
|
|
32
32
|
banner <<-EOS
|
33
33
|
|
34
34
|
Transrate v#{Transrate::VERSION::STRING.dup}
|
35
|
-
by Richard Smith-Unna
|
35
|
+
by Richard Smith-Unna, Chris Boursnell, Rob Patro,
|
36
|
+
Julian Hibberd, and Steve Kelly
|
36
37
|
|
37
38
|
DESCRIPTION:
|
38
39
|
Analyse a de-novo transcriptome assembly using three kinds of metrics:
|
@@ -64,12 +65,12 @@ opts = Trollop::options do
|
|
64
65
|
EOS
|
65
66
|
opt :assembly, "Assembly file(s) in FASTA format, comma-separated",
|
66
67
|
:type => String
|
67
|
-
opt :reference, "Reference proteome file in FASTA format",
|
68
|
-
:type => String
|
69
68
|
opt :left, "Left reads file in FASTQ format",
|
70
69
|
:type => String
|
71
70
|
opt :right, "Right reads file in FASTQ format",
|
72
71
|
:type => String
|
72
|
+
opt :reference, "Reference proteome file in FASTA format",
|
73
|
+
:type => String
|
73
74
|
opt :threads, "Number of threads to use",
|
74
75
|
:default => 8,
|
75
76
|
:type => Integer
|
@@ -95,7 +96,7 @@ blast_dep = File.join(gem_dir, 'deps', 'blast.yaml')
|
|
95
96
|
deps, read_deps, ref_deps = nil
|
96
97
|
unless opts.install_deps.nil?
|
97
98
|
|
98
|
-
unless %w[all read
|
99
|
+
unless %w[all read ref].include? opts.install_deps
|
99
100
|
raise TransrateError.new "install-deps #{opts.install_deps} is not valid. " +
|
100
101
|
"You must specify one of: all, read, ref."
|
101
102
|
end
|
@@ -110,12 +111,12 @@ if deps || read_deps || ref_deps
|
|
110
111
|
puts "Checking dependencies"
|
111
112
|
|
112
113
|
missing = []
|
113
|
-
if
|
114
|
+
if deps || read_deps
|
114
115
|
Bindeps.require gem_deps
|
115
116
|
missing += Bindeps.missing gem_deps
|
116
117
|
end
|
117
118
|
|
118
|
-
if
|
119
|
+
if deps || ref_deps
|
119
120
|
Bindeps.require blast_dep
|
120
121
|
missing += Bindeps.missing blast_dep
|
121
122
|
end
|
@@ -307,9 +308,12 @@ assemblies.split(',').each do |assembly|
|
|
307
308
|
logger.info "No reference provided, skipping comparative diagnostics"
|
308
309
|
end
|
309
310
|
|
311
|
+
prefix = "#{opts.outfile}_#{File.basename(assembly)}"
|
312
|
+
|
310
313
|
if (opts.left && opts.right)
|
311
314
|
score = transrater.assembly_score
|
312
|
-
|
315
|
+
|
316
|
+
optimal, cutoff = transrater.assembly_optimal_score prefix
|
313
317
|
unless score.nil?
|
314
318
|
pretty_print_hash({:TRANSRATE_ASSEMBLY_SCORE => score}, report_width, 4)
|
315
319
|
logger.info "-" * report_width
|
@@ -320,7 +324,7 @@ assemblies.split(',').each do |assembly|
|
|
320
324
|
end
|
321
325
|
|
322
326
|
# write contig metrics to file for each contig
|
323
|
-
outfile = "#{
|
327
|
+
outfile = "#{prefix}_contigs.csv"
|
324
328
|
logger.info "Writing contig metrics for each contig to #{outfile}"
|
325
329
|
# have option to turn off, default on
|
326
330
|
first=true
|
data/deps/deps.yaml
CHANGED
@@ -23,9 +23,11 @@ salmon:
|
|
23
23
|
binaries:
|
24
24
|
- salmon
|
25
25
|
libraries:
|
26
|
+
- libgcc_s.so.1
|
26
27
|
- libgomp.so.1
|
27
28
|
- libm.so.6
|
28
29
|
- librt.so.1
|
30
|
+
- libstdc++.so.6
|
29
31
|
- libtbb.so
|
30
32
|
- libtbb.so.2
|
31
33
|
- libtbbmalloc.so
|
@@ -37,10 +39,11 @@ salmon:
|
|
37
39
|
- libtbb.dylib
|
38
40
|
- libtbbmalloc.dylib
|
39
41
|
- libtbbmalloc_proxy.dylib
|
42
|
+
- libsalmon_core.a
|
40
43
|
version:
|
41
|
-
number: '0.
|
44
|
+
number: '0.4'
|
42
45
|
command: 'salmon -v'
|
43
46
|
url:
|
44
47
|
64bit:
|
45
|
-
linux: https://github.com/
|
46
|
-
macosx: https://github.com/
|
48
|
+
linux: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_DebianSqueeze.tar.gz
|
49
|
+
macosx: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_OSX-10.10.tar.gz
|
data/lib/transrate/assembly.rb
CHANGED
@@ -48,6 +48,10 @@ module Transrate
|
|
48
48
|
@assembly = {}
|
49
49
|
@n_bases = 0
|
50
50
|
Bio::FastaFormat.open(file).each do |entry|
|
51
|
+
if entry.seq.length == 0
|
52
|
+
logger.error "Entry found with no sequence #{entry.entry_id}"
|
53
|
+
raise AssemblyError
|
54
|
+
end
|
51
55
|
@n_bases += entry.length
|
52
56
|
contig = Contig.new(entry)
|
53
57
|
if @assembly.key?(contig.name)
|
@@ -20,249 +20,136 @@ module Transrate
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def run
|
23
|
-
|
24
|
-
|
25
|
-
@reciprocal_hits = @crbblast.size
|
26
|
-
@rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
|
27
|
-
@p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
|
28
|
-
@n_contigs_with_recip = @crbblast.reciprocals.size
|
29
|
-
count_ref_crbbs
|
30
|
-
@p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
|
31
|
-
self.run_comp_stats
|
23
|
+
crbblast = run_crb_blast
|
24
|
+
calculate_reference_coverage crbblast
|
32
25
|
@has_run = true
|
33
26
|
end
|
34
27
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
@
|
40
|
-
|
41
|
-
|
42
|
-
|
28
|
+
def calculate_reference_coverage crbblast
|
29
|
+
# The reciprocals hash in crb blast has contig names as the key.
|
30
|
+
# In order to look up by the reference name we need to reverse this.
|
31
|
+
# Scan through the reciprocals and get the Hit objects and add them to
|
32
|
+
# the @reference object for each reference sequence
|
33
|
+
get_reference_hits crbblast
|
34
|
+
per_query_contig_reference_coverage
|
35
|
+
per_target_contig_reference_coverage crbblast
|
43
36
|
end
|
44
37
|
|
45
|
-
def
|
46
|
-
crbblast
|
47
|
-
|
48
|
-
crbblast
|
49
|
-
end
|
50
|
-
|
51
|
-
# coverage of contigs that have reciprocal hits
|
52
|
-
# divided by number of reciprocal targets
|
53
|
-
def coverage crbblast
|
54
|
-
return @reference_coverage unless @reference_coverage.nil?
|
55
|
-
crbblast.reciprocals.each do |key, list|
|
56
|
-
list.each_with_index do |hit, i|
|
38
|
+
def get_reference_hits crbblast
|
39
|
+
crbblast.reciprocals.each do |query_id, list|
|
40
|
+
list.each do |hit|
|
57
41
|
unless @reference.assembly.key? hit.target
|
58
42
|
raise TransrateError.new "#{hit.target} not in reference"
|
59
43
|
end
|
60
44
|
@reference[hit.target].hits << hit
|
45
|
+
end
|
46
|
+
end
|
47
|
+
@comp_stats[:CRBB_hits] = crbblast.size
|
48
|
+
@comp_stats[:n_contigs_with_CRBB] = crbblast.reciprocals.size
|
49
|
+
@comp_stats[:p_contigs_with_CRBB] = crbblast.reciprocals.size/@assembly.size.to_f
|
50
|
+
end
|
61
51
|
|
62
|
-
|
63
|
-
|
52
|
+
def per_query_contig_reference_coverage
|
53
|
+
# for each query contig in the @assembly find out how much it covers
|
54
|
+
# the reference
|
55
|
+
n_refs_with_recip = 0
|
56
|
+
total_crbb_hits = 0
|
57
|
+
@reference.each do |ref_contig_name, ref_contig|
|
58
|
+
ref_contig.hits.each do |hit| # a Hit from query to target
|
59
|
+
query_contig_name = hit.query
|
60
|
+
unless @assembly.assembly.key? query_contig_name
|
61
|
+
raise TransrateError.new "#{query_contig_name} not in assembly"
|
64
62
|
end
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if
|
69
|
-
|
70
|
-
|
63
|
+
@assembly[query_contig_name].has_crb = true
|
64
|
+
@assembly[query_contig_name].hits << hit
|
65
|
+
raise TransrateError.new "query should not be protein" if hit.qprot
|
66
|
+
if hit.tprot
|
67
|
+
coverage = 3*hit.alnlen+2 - 3*hit.mismatches - 3*hit.gaps
|
68
|
+
coverage /= 3.0*hit.tlen
|
71
69
|
else
|
72
|
-
|
73
|
-
|
70
|
+
coverage = hit.alnlen - hit.mismatches - hit.gaps
|
71
|
+
coverage /= hit.tlen.to_f
|
74
72
|
end
|
75
|
-
|
73
|
+
@assembly[query_contig_name].reference_coverage = coverage
|
74
|
+
end
|
75
|
+
|
76
|
+
if ref_contig.hits.size > 0 # this reference has a crbblast hit
|
77
|
+
n_refs_with_recip += 1
|
76
78
|
end
|
79
|
+
total_crbb_hits += ref_contig.hits.size
|
77
80
|
end
|
81
|
+
@comp_stats[:rbh_per_reference] = total_crbb_hits / @reference.size.to_f
|
82
|
+
@comp_stats[:n_refs_with_CRBB] = n_refs_with_recip
|
83
|
+
@comp_stats[:p_refs_with_CRBB] = n_refs_with_recip / @reference.size.to_f
|
84
|
+
end
|
85
|
+
|
86
|
+
def per_target_contig_reference_coverage crbblast
|
87
|
+
# each target sequence in the reference can have multiple query contigs
|
88
|
+
# hit it. to calculate the reference coverage you can't just add up the
|
89
|
+
# alignment lengths. you have to make sure that overlaps are taken into
|
90
|
+
# account
|
91
|
+
coverage_thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
|
92
|
+
coverage_totals = [0, 0, 0, 0, 0]
|
93
|
+
prot = crbblast.target_is_prot
|
78
94
|
total_coverage = 0
|
79
95
|
total_length = 0
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
key = ref_contig.name
|
84
|
-
list = ref_contig.hits
|
85
|
-
if crbblast.target_is_prot
|
86
|
-
total_length += ref_contig.length * 3
|
96
|
+
@reference.each do |ref_contig_name, ref_contig|
|
97
|
+
if prot
|
98
|
+
covered = Array.new(ref_contig.length*3, false)
|
87
99
|
else
|
88
|
-
|
100
|
+
covered = Array.new(ref_contig.length, false)
|
89
101
|
end
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
start, stop = [hit.tstart, hit.tend].minmax
|
102
|
-
end
|
103
|
-
if blocks.empty?
|
104
|
-
blocks << [start, stop]
|
105
|
-
else
|
106
|
-
found=false
|
107
|
-
blocks.each do |block|
|
108
|
-
# if query overlaps with any block extend that block
|
109
|
-
o = overlap(block[0], block[1], start, stop)
|
110
|
-
if o == 0 # perfect overlap
|
111
|
-
found=true
|
112
|
-
elsif o == 1 # partial overlap
|
113
|
-
block[0] = start
|
114
|
-
found=true
|
115
|
-
elsif o == 2 # partial overlap
|
116
|
-
block[1] = stop
|
117
|
-
found=true
|
118
|
-
elsif o == 3 # full overlap
|
119
|
-
block[0] = start
|
120
|
-
block[1] = stop
|
121
|
-
found=true
|
122
|
-
elsif o == 4 # full overlap
|
123
|
-
found=true
|
124
|
-
# nothing
|
125
|
-
# elsif o == 5 || o == 6 # no overlap
|
126
|
-
|
127
|
-
end
|
102
|
+
ref_contig.hits.each_with_index do |hit, i| # a Hit from query to target
|
103
|
+
if prot
|
104
|
+
if hit.qstart % 3 == 0
|
105
|
+
tstart = 3*hit.tstart-4
|
106
|
+
tend = 3*hit.tend
|
107
|
+
elsif hit.qstart % 3 == 1
|
108
|
+
tstart = 3*hit.tstart-2
|
109
|
+
tend = 3*hit.tend
|
110
|
+
elsif hit.qstart % 3 == 2
|
111
|
+
tstart = 3*hit.tstart-3
|
112
|
+
tend = 3*hit.tend-1
|
128
113
|
end
|
129
|
-
if
|
130
|
-
|
114
|
+
if hit.qlen % 3 == 1
|
115
|
+
tend += 1
|
116
|
+
elsif hit.qlen % 3 == 2
|
117
|
+
tend += 2
|
131
118
|
end
|
132
|
-
|
133
|
-
|
119
|
+
else
|
120
|
+
tstart = hit.tstart
|
121
|
+
tend = hit.tend
|
122
|
+
end
|
123
|
+
(tstart..tend).each do |b|
|
124
|
+
covered[b-1] = true # blast coords are 1 indexed
|
134
125
|
end
|
135
126
|
end
|
136
|
-
|
137
|
-
|
138
|
-
if a!=b
|
139
|
-
o = overlap(block_a[0], block_a[1], block_b[0], block_b[1])
|
140
|
-
if o == 0 # perfect overlap
|
141
|
-
block_b[0]=-1
|
142
|
-
block_b[1]=-1
|
143
|
-
elsif o == 1 # partial overlap
|
144
|
-
block_a[0] = block_b[0]
|
145
|
-
block_b[0] = -1
|
146
|
-
block_b[1] = -1
|
147
|
-
elsif o == 2 # partial overlap
|
148
|
-
block_a[1] = block_b[1]
|
149
|
-
block_b[0] = -1
|
150
|
-
block_b[1] = -1
|
151
|
-
elsif o == 3 # full overlap
|
152
|
-
block_a[0] = block_b[0]
|
153
|
-
block_a[1] = block_b[1]
|
154
|
-
block_b[0] = -1
|
155
|
-
block_b[1] = -1
|
156
|
-
elsif o == 4 # full overlap
|
157
|
-
block_b[0] = -1
|
158
|
-
block_b[1] = -1
|
159
|
-
# elsif o == 5 || o == 6# no overlap
|
160
|
-
# do nothing
|
161
|
-
# elsif # no overlap
|
162
|
-
# do nothing
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end # each_with_index b
|
166
|
-
end # each_with_index a
|
167
|
-
# sum blocks to find total coverage
|
168
|
-
length_of_coverage = calculate_coverage blocks
|
169
|
-
if target_length > 0
|
170
|
-
ref_p = length_of_coverage / target_length.to_f
|
171
|
-
else
|
172
|
-
ref_p = 0
|
173
|
-
end
|
127
|
+
coverage = covered.reduce(0) { |sum, v| v ? sum + 1 : sum }
|
128
|
+
ref_p = coverage / covered.length.to_f
|
174
129
|
ref_contig.reference_coverage = ref_p
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
@cov[i] +=1
|
130
|
+
coverage_thresholds.each_with_index do |n, index|
|
131
|
+
if ref_p >= n
|
132
|
+
coverage_totals[index] += 1
|
179
133
|
end
|
180
134
|
end
|
181
135
|
|
182
|
-
total_coverage +=
|
136
|
+
total_coverage += coverage
|
137
|
+
total_length += covered.length
|
183
138
|
end
|
184
139
|
|
185
|
-
|
186
|
-
|
140
|
+
# calculate proportion of ref sequences with coverage over thresholds
|
141
|
+
coverage_thresholds.each_with_index do |p, i|
|
142
|
+
@comp_stats["cov#{(100*p).to_i}".to_sym] = coverage_totals[i]
|
187
143
|
@comp_stats["p_cov#{(100*p).to_i}".to_sym] =
|
188
|
-
|
189
|
-
end
|
190
|
-
total_coverage / total_length.to_f
|
191
|
-
end
|
192
|
-
|
193
|
-
# Calculate the total coverage from a set of coverage blocks
|
194
|
-
def calculate_coverage blocks
|
195
|
-
coverage = 0
|
196
|
-
blocks.each do |block|
|
197
|
-
if block[0] and block[1]
|
198
|
-
if block[0]>=0 and block[1]>=0
|
199
|
-
coverage += block[1] - block[0] + 1
|
200
|
-
end
|
201
|
-
else
|
202
|
-
puts "error: key = #{key}, #{blocks}"
|
203
|
-
end
|
204
|
-
end
|
205
|
-
coverage
|
206
|
-
end
|
207
|
-
|
208
|
-
# Count reference proteins with at least one recprocal hit
|
209
|
-
def count_ref_crbbs
|
210
|
-
@n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
|
211
|
-
name, contig = entry
|
212
|
-
sum + (contig.hits.length > 0 ? 1 : 0)
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
def overlap(astart, astop, bstart, bstop)
|
217
|
-
if astart == bstart and astop == bstop
|
218
|
-
return 0
|
219
|
-
elsif astart < bstart
|
220
|
-
if astop > bstart
|
221
|
-
if astop > bstop
|
222
|
-
return 4
|
223
|
-
else
|
224
|
-
return 2
|
225
|
-
end
|
226
|
-
else
|
227
|
-
return 5 # no overlap
|
228
|
-
end
|
229
|
-
else
|
230
|
-
if bstop > astart
|
231
|
-
if bstop > astop
|
232
|
-
return 3
|
233
|
-
else
|
234
|
-
return 1
|
235
|
-
end
|
236
|
-
else
|
237
|
-
return 6 # no overlap
|
238
|
-
end
|
144
|
+
coverage_totals[i]/@reference.size.to_f
|
239
145
|
end
|
146
|
+
@comp_stats[:reference_coverage] = total_coverage / total_length.to_f
|
240
147
|
end
|
241
148
|
|
242
|
-
def
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
if astop > bstart
|
247
|
-
if astop > bstop
|
248
|
-
return (bstop-bstart+1)/(astop-astart+1).to_f # 4
|
249
|
-
else
|
250
|
-
return (astop-bstart+1)/(bstop-astart+1).to_f # 2
|
251
|
-
end
|
252
|
-
else
|
253
|
-
return 0 # 5 no overlap
|
254
|
-
end
|
255
|
-
else
|
256
|
-
if bstop > astart
|
257
|
-
if bstop > astop
|
258
|
-
return (astop-astart+1)/(bstop-bstart+1).to_f # 3
|
259
|
-
else
|
260
|
-
return (bstop-astart+1)/(astop-bstart+1).to_f # 1
|
261
|
-
end
|
262
|
-
else
|
263
|
-
return 0 # 6 no overlap
|
264
|
-
end
|
265
|
-
end
|
149
|
+
def run_crb_blast
|
150
|
+
crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
|
151
|
+
crbblast.run(1e-5, @threads, true)
|
152
|
+
crbblast
|
266
153
|
end
|
267
154
|
|
268
155
|
end # ComparativeMetrics
|