transrate 1.0.0.beta3 → 1.0.0.beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/Rakefile +1 -1
- data/bin/transrate +12 -8
- data/deps/deps.yaml +6 -3
- data/lib/transrate/assembly.rb +4 -0
- data/lib/transrate/comparative_metrics.rb +97 -210
- data/lib/transrate/read_metrics.rb +11 -2
- data/lib/transrate/salmon.rb +13 -3
- data/lib/transrate/score_optimiser.rb +6 -1
- data/lib/transrate/snap.rb +1 -2
- data/lib/transrate/transrater.rb +2 -2
- data/lib/transrate/version.rb +1 -1
- data/test/data/sorghum_100.fa +1 -1
- data/test/data/test.sf +19 -19
- data/test/test_assembly.rb +2 -2
- data/test/test_bin.rb +1 -0
- data/test/test_comp_metrics.rb +333 -162
- data/test/test_optimiser.rb +2 -2
- data/test/test_read_metrics.rb +9 -9
- data/test/test_salmon.rb +13 -1
- data/test/test_transrater.rb +1 -1
- data/transrate.gemspec +1 -1
- metadata +6 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6662137942e3933a714a4950d4de53826b71aad
|
4
|
+
data.tar.gz: 2863b7bc2b0f63af4c43dc31c77f43324dafccfc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30f64bf256bb98ae6031ee635c1187af680974fa97bc719d108a3878cfacaef7216fd5c8c5e0a07be077eca55d6379e1562f425eee3948d86a1f2dc816f1cfef
|
7
|
+
data.tar.gz: fb8fbd6cc684112693fd7241860724938610efa9b8950811b4cfe7b9b5232b28a8101a1c49157fa22a0aff979029628c6141f75a271871cc0a07ffe2402b7666
|
data/README.md
CHANGED
@@ -2,8 +2,11 @@
|
|
2
2
|
<img alt="Transrate - understand your transcriptome assembly" src="https://github.com/Blahah/transrate/raw/master/docs/transrate_logo_full.png">
|
3
3
|
</p>
|
4
4
|
|
5
|
+
Download latest binaries: [][bintray]
|
6
|
+
|
5
7
|
Need help? Chat to us live: [](https://gitter.im/Blahah/transrate?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
6
8
|
|
9
|
+
|
7
10
|
## Development status
|
8
11
|
|
9
12
|
[][gem]
|
@@ -19,6 +22,7 @@ Need help? Chat to us live: [](
|
|
19
22
|
[gemnasium]: https://gemnasium.com/Blahah/transrate
|
20
23
|
[codeclimate]: https://codeclimate.com/github/Blahah/transrate
|
21
24
|
[coveralls]: https://coveralls.io/r/Blahah/transrate
|
25
|
+
[bintray]: https://bintray.com/blahah/generic/transrate/_latestVersion
|
22
26
|
|
23
27
|
This software is being actively developed. Please be aware that there may be bugs. If you find any, please report them on the [issue tracker](https://github.com/Blahah/transrate/issues).
|
24
28
|
|
data/Rakefile
CHANGED
data/bin/transrate
CHANGED
@@ -32,7 +32,8 @@ opts = Trollop::options do
|
|
32
32
|
banner <<-EOS
|
33
33
|
|
34
34
|
Transrate v#{Transrate::VERSION::STRING.dup}
|
35
|
-
by Richard Smith-Unna
|
35
|
+
by Richard Smith-Unna, Chris Boursnell, Rob Patro,
|
36
|
+
Julian Hibberd, and Steve Kelly
|
36
37
|
|
37
38
|
DESCRIPTION:
|
38
39
|
Analyse a de-novo transcriptome assembly using three kinds of metrics:
|
@@ -64,12 +65,12 @@ opts = Trollop::options do
|
|
64
65
|
EOS
|
65
66
|
opt :assembly, "Assembly file(s) in FASTA format, comma-separated",
|
66
67
|
:type => String
|
67
|
-
opt :reference, "Reference proteome file in FASTA format",
|
68
|
-
:type => String
|
69
68
|
opt :left, "Left reads file in FASTQ format",
|
70
69
|
:type => String
|
71
70
|
opt :right, "Right reads file in FASTQ format",
|
72
71
|
:type => String
|
72
|
+
opt :reference, "Reference proteome file in FASTA format",
|
73
|
+
:type => String
|
73
74
|
opt :threads, "Number of threads to use",
|
74
75
|
:default => 8,
|
75
76
|
:type => Integer
|
@@ -95,7 +96,7 @@ blast_dep = File.join(gem_dir, 'deps', 'blast.yaml')
|
|
95
96
|
deps, read_deps, ref_deps = nil
|
96
97
|
unless opts.install_deps.nil?
|
97
98
|
|
98
|
-
unless %w[all read
|
99
|
+
unless %w[all read ref].include? opts.install_deps
|
99
100
|
raise TransrateError.new "install-deps #{opts.install_deps} is not valid. " +
|
100
101
|
"You must specify one of: all, read, ref."
|
101
102
|
end
|
@@ -110,12 +111,12 @@ if deps || read_deps || ref_deps
|
|
110
111
|
puts "Checking dependencies"
|
111
112
|
|
112
113
|
missing = []
|
113
|
-
if
|
114
|
+
if deps || read_deps
|
114
115
|
Bindeps.require gem_deps
|
115
116
|
missing += Bindeps.missing gem_deps
|
116
117
|
end
|
117
118
|
|
118
|
-
if
|
119
|
+
if deps || ref_deps
|
119
120
|
Bindeps.require blast_dep
|
120
121
|
missing += Bindeps.missing blast_dep
|
121
122
|
end
|
@@ -307,9 +308,12 @@ assemblies.split(',').each do |assembly|
|
|
307
308
|
logger.info "No reference provided, skipping comparative diagnostics"
|
308
309
|
end
|
309
310
|
|
311
|
+
prefix = "#{opts.outfile}_#{File.basename(assembly)}"
|
312
|
+
|
310
313
|
if (opts.left && opts.right)
|
311
314
|
score = transrater.assembly_score
|
312
|
-
|
315
|
+
|
316
|
+
optimal, cutoff = transrater.assembly_optimal_score prefix
|
313
317
|
unless score.nil?
|
314
318
|
pretty_print_hash({:TRANSRATE_ASSEMBLY_SCORE => score}, report_width, 4)
|
315
319
|
logger.info "-" * report_width
|
@@ -320,7 +324,7 @@ assemblies.split(',').each do |assembly|
|
|
320
324
|
end
|
321
325
|
|
322
326
|
# write contig metrics to file for each contig
|
323
|
-
outfile = "#{
|
327
|
+
outfile = "#{prefix}_contigs.csv"
|
324
328
|
logger.info "Writing contig metrics for each contig to #{outfile}"
|
325
329
|
# have option to turn off, default on
|
326
330
|
first=true
|
data/deps/deps.yaml
CHANGED
@@ -23,9 +23,11 @@ salmon:
|
|
23
23
|
binaries:
|
24
24
|
- salmon
|
25
25
|
libraries:
|
26
|
+
- libgcc_s.so.1
|
26
27
|
- libgomp.so.1
|
27
28
|
- libm.so.6
|
28
29
|
- librt.so.1
|
30
|
+
- libstdc++.so.6
|
29
31
|
- libtbb.so
|
30
32
|
- libtbb.so.2
|
31
33
|
- libtbbmalloc.so
|
@@ -37,10 +39,11 @@ salmon:
|
|
37
39
|
- libtbb.dylib
|
38
40
|
- libtbbmalloc.dylib
|
39
41
|
- libtbbmalloc_proxy.dylib
|
42
|
+
- libsalmon_core.a
|
40
43
|
version:
|
41
|
-
number: '0.
|
44
|
+
number: '0.4'
|
42
45
|
command: 'salmon -v'
|
43
46
|
url:
|
44
47
|
64bit:
|
45
|
-
linux: https://github.com/
|
46
|
-
macosx: https://github.com/
|
48
|
+
linux: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_DebianSqueeze.tar.gz
|
49
|
+
macosx: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_OSX-10.10.tar.gz
|
data/lib/transrate/assembly.rb
CHANGED
@@ -48,6 +48,10 @@ module Transrate
|
|
48
48
|
@assembly = {}
|
49
49
|
@n_bases = 0
|
50
50
|
Bio::FastaFormat.open(file).each do |entry|
|
51
|
+
if entry.seq.length == 0
|
52
|
+
logger.error "Entry found with no sequence #{entry.entry_id}"
|
53
|
+
raise AssemblyError
|
54
|
+
end
|
51
55
|
@n_bases += entry.length
|
52
56
|
contig = Contig.new(entry)
|
53
57
|
if @assembly.key?(contig.name)
|
@@ -20,249 +20,136 @@ module Transrate
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def run
|
23
|
-
|
24
|
-
|
25
|
-
@reciprocal_hits = @crbblast.size
|
26
|
-
@rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
|
27
|
-
@p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
|
28
|
-
@n_contigs_with_recip = @crbblast.reciprocals.size
|
29
|
-
count_ref_crbbs
|
30
|
-
@p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
|
31
|
-
self.run_comp_stats
|
23
|
+
crbblast = run_crb_blast
|
24
|
+
calculate_reference_coverage crbblast
|
32
25
|
@has_run = true
|
33
26
|
end
|
34
27
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
@
|
40
|
-
|
41
|
-
|
42
|
-
|
28
|
+
def calculate_reference_coverage crbblast
|
29
|
+
# The reciprocals hash in crb blast has contig names as the key.
|
30
|
+
# In order to look up by the reference name we need to reverse this.
|
31
|
+
# Scan through the reciprocals and get this Hit objects and add them to
|
32
|
+
# the @reference object for each reference sequence
|
33
|
+
get_reference_hits crbblast
|
34
|
+
per_query_contig_reference_coverage
|
35
|
+
per_target_contig_reference_coverage crbblast
|
43
36
|
end
|
44
37
|
|
45
|
-
def
|
46
|
-
crbblast
|
47
|
-
|
48
|
-
crbblast
|
49
|
-
end
|
50
|
-
|
51
|
-
# coverage of contigs that have reciprocal hits
|
52
|
-
# divided by number of reciprocal targets
|
53
|
-
def coverage crbblast
|
54
|
-
return @reference_coverage unless @reference_coverage.nil?
|
55
|
-
crbblast.reciprocals.each do |key, list|
|
56
|
-
list.each_with_index do |hit, i|
|
38
|
+
def get_reference_hits crbblast
|
39
|
+
crbblast.reciprocals.each do |query_id, list|
|
40
|
+
list.each do |hit|
|
57
41
|
unless @reference.assembly.key? hit.target
|
58
42
|
raise TransrateError.new "#{hit.target} not in reference"
|
59
43
|
end
|
60
44
|
@reference[hit.target].hits << hit
|
45
|
+
end
|
46
|
+
end
|
47
|
+
@comp_stats[:CRBB_hits] = crbblast.size
|
48
|
+
@comp_stats[:n_contigs_with_CRBB] = crbblast.reciprocals.size
|
49
|
+
@comp_stats[:p_contigs_with_CRBB] = crbblast.reciprocals.size/@assembly.size.to_f
|
50
|
+
end
|
61
51
|
|
62
|
-
|
63
|
-
|
52
|
+
def per_query_contig_reference_coverage
|
53
|
+
# for each query contig in the @assembly find out how much it covers
|
54
|
+
# the reference
|
55
|
+
n_refs_with_recip = 0
|
56
|
+
total_crbb_hits = 0
|
57
|
+
@reference.each do |ref_contig_name, ref_contig|
|
58
|
+
ref_contig.hits.each do |hit| # a Hit from query to target
|
59
|
+
query_contig_name = hit.query
|
60
|
+
unless @assembly.assembly.key? query_contig_name
|
61
|
+
raise TransrateError.new "#{query_contig_name} not in assembly"
|
64
62
|
end
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if
|
69
|
-
|
70
|
-
|
63
|
+
@assembly[query_contig_name].has_crb = true
|
64
|
+
@assembly[query_contig_name].hits << hit
|
65
|
+
raise TransrateError.new "query should not be protein" if hit.qprot
|
66
|
+
if hit.tprot
|
67
|
+
coverage = 3*hit.alnlen+2 - 3*hit.mismatches - 3*hit.gaps
|
68
|
+
coverage /= 3.0*hit.tlen
|
71
69
|
else
|
72
|
-
|
73
|
-
|
70
|
+
coverage = hit.alnlen - hit.mismatches - hit.gaps
|
71
|
+
coverage /= hit.tlen.to_f
|
74
72
|
end
|
75
|
-
|
73
|
+
@assembly[query_contig_name].reference_coverage = coverage
|
74
|
+
end
|
75
|
+
|
76
|
+
if ref_contig.hits.size > 0 # this reference has a crbblast hit
|
77
|
+
n_refs_with_recip += 1
|
76
78
|
end
|
79
|
+
total_crbb_hits += ref_contig.hits.size
|
77
80
|
end
|
81
|
+
@comp_stats[:rbh_per_reference] = total_crbb_hits / @reference.size.to_f
|
82
|
+
@comp_stats[:n_refs_with_CRBB] = n_refs_with_recip
|
83
|
+
@comp_stats[:p_refs_with_CRBB] = n_refs_with_recip / @reference.size.to_f
|
84
|
+
end
|
85
|
+
|
86
|
+
def per_target_contig_reference_coverage crbblast
|
87
|
+
# each target sequence in the reference can have multiple query contigs
|
88
|
+
# hit it. to calculate the reference coverage you can't just add up the
|
89
|
+
# alignment lengths. you have to make sure that overlaps are taken into
|
90
|
+
# account
|
91
|
+
coverage_thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
|
92
|
+
coverage_totals = [0, 0, 0, 0, 0]
|
93
|
+
prot = crbblast.target_is_prot
|
78
94
|
total_coverage = 0
|
79
95
|
total_length = 0
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
key = ref_contig.name
|
84
|
-
list = ref_contig.hits
|
85
|
-
if crbblast.target_is_prot
|
86
|
-
total_length += ref_contig.length * 3
|
96
|
+
@reference.each do |ref_contig_name, ref_contig|
|
97
|
+
if prot
|
98
|
+
covered = Array.new(ref_contig.length*3, false)
|
87
99
|
else
|
88
|
-
|
100
|
+
covered = Array.new(ref_contig.length, false)
|
89
101
|
end
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
start, stop = [hit.tstart, hit.tend].minmax
|
102
|
-
end
|
103
|
-
if blocks.empty?
|
104
|
-
blocks << [start, stop]
|
105
|
-
else
|
106
|
-
found=false
|
107
|
-
blocks.each do |block|
|
108
|
-
# if query overlaps with any block extend that block
|
109
|
-
o = overlap(block[0], block[1], start, stop)
|
110
|
-
if o == 0 # perfect overlap
|
111
|
-
found=true
|
112
|
-
elsif o == 1 # partial overlap
|
113
|
-
block[0] = start
|
114
|
-
found=true
|
115
|
-
elsif o == 2 # partial overlap
|
116
|
-
block[1] = stop
|
117
|
-
found=true
|
118
|
-
elsif o == 3 # full overlap
|
119
|
-
block[0] = start
|
120
|
-
block[1] = stop
|
121
|
-
found=true
|
122
|
-
elsif o == 4 # full overlap
|
123
|
-
found=true
|
124
|
-
# nothing
|
125
|
-
# elsif o == 5 || o == 6 # no overlap
|
126
|
-
|
127
|
-
end
|
102
|
+
ref_contig.hits.each_with_index do |hit, i| # a Hit from query to target
|
103
|
+
if prot
|
104
|
+
if hit.qstart % 3 == 0
|
105
|
+
tstart = 3*hit.tstart-4
|
106
|
+
tend = 3*hit.tend
|
107
|
+
elsif hit.qstart % 3 == 1
|
108
|
+
tstart = 3*hit.tstart-2
|
109
|
+
tend = 3*hit.tend
|
110
|
+
elsif hit.qstart % 3 == 2
|
111
|
+
tstart = 3*hit.tstart-3
|
112
|
+
tend = 3*hit.tend-1
|
128
113
|
end
|
129
|
-
if
|
130
|
-
|
114
|
+
if hit.qlen % 3 == 1
|
115
|
+
tend += 1
|
116
|
+
elsif hit.qlen % 3 == 2
|
117
|
+
tend += 2
|
131
118
|
end
|
132
|
-
|
133
|
-
|
119
|
+
else
|
120
|
+
tstart = hit.tstart
|
121
|
+
tend = hit.tend
|
122
|
+
end
|
123
|
+
(tstart..tend).each do |b|
|
124
|
+
covered[b-1] = true # blast coords are 1 indexed
|
134
125
|
end
|
135
126
|
end
|
136
|
-
|
137
|
-
|
138
|
-
if a!=b
|
139
|
-
o = overlap(block_a[0], block_a[1], block_b[0], block_b[1])
|
140
|
-
if o == 0 # perfect overlap
|
141
|
-
block_b[0]=-1
|
142
|
-
block_b[1]=-1
|
143
|
-
elsif o == 1 # partial overlap
|
144
|
-
block_a[0] = block_b[0]
|
145
|
-
block_b[0] = -1
|
146
|
-
block_b[1] = -1
|
147
|
-
elsif o == 2 # partial overlap
|
148
|
-
block_a[1] = block_b[1]
|
149
|
-
block_b[0] = -1
|
150
|
-
block_b[1] = -1
|
151
|
-
elsif o == 3 # full overlap
|
152
|
-
block_a[0] = block_b[0]
|
153
|
-
block_a[1] = block_b[1]
|
154
|
-
block_b[0] = -1
|
155
|
-
block_b[1] = -1
|
156
|
-
elsif o == 4 # full overlap
|
157
|
-
block_b[0] = -1
|
158
|
-
block_b[1] = -1
|
159
|
-
# elsif o == 5 || o == 6# no overlap
|
160
|
-
# do nothing
|
161
|
-
# elsif # no overlap
|
162
|
-
# do nothing
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end # each_with_index b
|
166
|
-
end # each_with_index a
|
167
|
-
# sum blocks to find total coverage
|
168
|
-
length_of_coverage = calculate_coverage blocks
|
169
|
-
if target_length > 0
|
170
|
-
ref_p = length_of_coverage / target_length.to_f
|
171
|
-
else
|
172
|
-
ref_p = 0
|
173
|
-
end
|
127
|
+
coverage = covered.reduce(0) { |sum, v| v ? sum + 1 : sum }
|
128
|
+
ref_p = coverage / covered.length.to_f
|
174
129
|
ref_contig.reference_coverage = ref_p
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
@cov[i] +=1
|
130
|
+
coverage_thresholds.each_with_index do |n, index|
|
131
|
+
if ref_p >= n
|
132
|
+
coverage_totals[index] += 1
|
179
133
|
end
|
180
134
|
end
|
181
135
|
|
182
|
-
total_coverage +=
|
136
|
+
total_coverage += coverage
|
137
|
+
total_length += covered.length
|
183
138
|
end
|
184
139
|
|
185
|
-
|
186
|
-
|
140
|
+
# calculate proportion of ref sequences with coveragre over thresholds
|
141
|
+
coverage_thresholds.each_with_index do |p, i|
|
142
|
+
@comp_stats["cov#{(100*p).to_i}".to_sym] = coverage_totals[i]
|
187
143
|
@comp_stats["p_cov#{(100*p).to_i}".to_sym] =
|
188
|
-
|
189
|
-
end
|
190
|
-
total_coverage / total_length.to_f
|
191
|
-
end
|
192
|
-
|
193
|
-
# Calculate the total coverage from a set of coverage blocks
|
194
|
-
def calculate_coverage blocks
|
195
|
-
coverage = 0
|
196
|
-
blocks.each do |block|
|
197
|
-
if block[0] and block[1]
|
198
|
-
if block[0]>=0 and block[1]>=0
|
199
|
-
coverage += block[1] - block[0] + 1
|
200
|
-
end
|
201
|
-
else
|
202
|
-
puts "error: key = #{key}, #{blocks}"
|
203
|
-
end
|
204
|
-
end
|
205
|
-
coverage
|
206
|
-
end
|
207
|
-
|
208
|
-
# Count reference proteins with at least one recprocal hit
|
209
|
-
def count_ref_crbbs
|
210
|
-
@n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
|
211
|
-
name, contig = entry
|
212
|
-
sum + (contig.hits.length > 0 ? 1 : 0)
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
def overlap(astart, astop, bstart, bstop)
|
217
|
-
if astart == bstart and astop == bstop
|
218
|
-
return 0
|
219
|
-
elsif astart < bstart
|
220
|
-
if astop > bstart
|
221
|
-
if astop > bstop
|
222
|
-
return 4
|
223
|
-
else
|
224
|
-
return 2
|
225
|
-
end
|
226
|
-
else
|
227
|
-
return 5 # no overlap
|
228
|
-
end
|
229
|
-
else
|
230
|
-
if bstop > astart
|
231
|
-
if bstop > astop
|
232
|
-
return 3
|
233
|
-
else
|
234
|
-
return 1
|
235
|
-
end
|
236
|
-
else
|
237
|
-
return 6 # no overlap
|
238
|
-
end
|
144
|
+
coverage_totals[i]/@reference.size.to_f
|
239
145
|
end
|
146
|
+
@comp_stats[:reference_coverage] = total_coverage / total_length.to_f
|
240
147
|
end
|
241
148
|
|
242
|
-
def
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
if astop > bstart
|
247
|
-
if astop > bstop
|
248
|
-
return (bstop-bstart+1)/(astop-astart+1).to_f # 4
|
249
|
-
else
|
250
|
-
return (astop-bstart+1)/(bstop-astart+1).to_f # 2
|
251
|
-
end
|
252
|
-
else
|
253
|
-
return 0 # 5 no overlap
|
254
|
-
end
|
255
|
-
else
|
256
|
-
if bstop > astart
|
257
|
-
if bstop > astop
|
258
|
-
return (astop-astart+1)/(bstop-bstart+1).to_f # 3
|
259
|
-
else
|
260
|
-
return (bstop-astart+1)/(astop-bstart+1).to_f # 1
|
261
|
-
end
|
262
|
-
else
|
263
|
-
return 0 # 6 no overlap
|
264
|
-
end
|
265
|
-
end
|
149
|
+
def run_crb_blast
|
150
|
+
crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
|
151
|
+
crbblast.run(1e-5, @threads, true)
|
152
|
+
crbblast
|
266
153
|
end
|
267
154
|
|
268
155
|
end # ComparativeMetrics
|