transrate 1.0.0.beta3 → 1.0.0.beta4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a663790460cbe480bce88af21df3609e8b63a073
4
- data.tar.gz: 22b4fb0af30dc78afeee914abd8b93d1e7dd7966
3
+ metadata.gz: e6662137942e3933a714a4950d4de53826b71aad
4
+ data.tar.gz: 2863b7bc2b0f63af4c43dc31c77f43324dafccfc
5
5
  SHA512:
6
- metadata.gz: ed8cad8ff40d18fd6ce7b1a78bf8a6dab2ef0b141d98a08c5403e8f766e4b11ab79f96e32f30abdae951c6fee3b5d31c3a2cc1d3c98a5a93c20ea258b6424183
7
- data.tar.gz: a60476a414dc3626d80d5d1f8ed967c3dcb79563ce97454b79b5f5595451c4285582650591b5974ac0114980482fadd8f6e6d12c50f392a41cac04fe1562ce86
6
+ metadata.gz: 30f64bf256bb98ae6031ee635c1187af680974fa97bc719d108a3878cfacaef7216fd5c8c5e0a07be077eca55d6379e1562f425eee3948d86a1f2dc816f1cfef
7
+ data.tar.gz: fb8fbd6cc684112693fd7241860724938610efa9b8950811b4cfe7b9b5232b28a8101a1c49157fa22a0aff979029628c6141f75a271871cc0a07ffe2402b7666
data/README.md CHANGED
@@ -2,8 +2,11 @@
2
2
  <img alt="Transrate - understand your transcriptome assembly" src="https://github.com/Blahah/transrate/raw/master/docs/transrate_logo_full.png">
3
3
  </p>
4
4
 
5
+ Download latest binaries: [![Download](https://api.bintray.com/packages/blahah/generic/transrate/images/download.svg)][bintray]
6
+
5
7
  Need help? Chat to us live: [![Gitter](https://badges.gitter.im/Join Chat.svg)](https://gitter.im/Blahah/transrate?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
6
8
 
9
+
7
10
  ## Development status
8
11
 
9
12
  [![Gem Version](http://img.shields.io/gem/v/transrate.svg)][gem]
@@ -19,6 +22,7 @@ Need help? Chat to us live: [![Gitter](https://badges.gitter.im/Join Chat.svg)](
19
22
  [gemnasium]: https://gemnasium.com/Blahah/transrate
20
23
  [codeclimate]: https://codeclimate.com/github/Blahah/transrate
21
24
  [coveralls]: https://coveralls.io/r/Blahah/transrate
25
+ [bintray]: https://bintray.com/blahah/generic/transrate/_latestVersion
22
26
 
23
27
  This software is being actively developed. Please be aware that there may be bugs. If you find any, please report them on the [issue tracker](https://github.com/Blahah/transrate/issues).
24
28
 
data/Rakefile CHANGED
@@ -90,7 +90,7 @@ task :default => :test
90
90
  # PACKAGING
91
91
 
92
92
  PACKAGE_NAME = "transrate"
93
- VERSION = "1.0.0.beta3"
93
+ VERSION = "1.0.0.beta4"
94
94
  TRAVELING_RUBY_VERSION = "20150210-2.2.0"
95
95
 
96
96
  desc "Package your app"
data/bin/transrate CHANGED
@@ -32,7 +32,8 @@ opts = Trollop::options do
32
32
  banner <<-EOS
33
33
 
34
34
  Transrate v#{Transrate::VERSION::STRING.dup}
35
- by Richard Smith-Unna <rds45@cam.ac.uk> and Chris Boursnell
35
+ by Richard Smith-Unna, Chris Boursnell, Rob Patro,
36
+ Julian Hibberd, and Steve Kelly
36
37
 
37
38
  DESCRIPTION:
38
39
  Analyse a de-novo transcriptome assembly using three kinds of metrics:
@@ -64,12 +65,12 @@ opts = Trollop::options do
64
65
  EOS
65
66
  opt :assembly, "Assembly file(s) in FASTA format, comma-separated",
66
67
  :type => String
67
- opt :reference, "Reference proteome file in FASTA format",
68
- :type => String
69
68
  opt :left, "Left reads file in FASTQ format",
70
69
  :type => String
71
70
  opt :right, "Right reads file in FASTQ format",
72
71
  :type => String
72
+ opt :reference, "Reference proteome file in FASTA format",
73
+ :type => String
73
74
  opt :threads, "Number of threads to use",
74
75
  :default => 8,
75
76
  :type => Integer
@@ -95,7 +96,7 @@ blast_dep = File.join(gem_dir, 'deps', 'blast.yaml')
95
96
  deps, read_deps, ref_deps = nil
96
97
  unless opts.install_deps.nil?
97
98
 
98
- unless %w[all read red].include? opts.install_deps
99
+ unless %w[all read ref].include? opts.install_deps
99
100
  raise TransrateError.new "install-deps #{opts.install_deps} is not valid. " +
100
101
  "You must specify one of: all, read, ref."
101
102
  end
@@ -110,12 +111,12 @@ if deps || read_deps || ref_deps
110
111
  puts "Checking dependencies"
111
112
 
112
113
  missing = []
113
- if opts.install_deps || opts.install_read_deps
114
+ if deps || read_deps
114
115
  Bindeps.require gem_deps
115
116
  missing += Bindeps.missing gem_deps
116
117
  end
117
118
 
118
- if opts.install_deps || opts.install_ref_deps
119
+ if deps || ref_deps
119
120
  Bindeps.require blast_dep
120
121
  missing += Bindeps.missing blast_dep
121
122
  end
@@ -307,9 +308,12 @@ assemblies.split(',').each do |assembly|
307
308
  logger.info "No reference provided, skipping comparative diagnostics"
308
309
  end
309
310
 
311
+ prefix = "#{opts.outfile}_#{File.basename(assembly)}"
312
+
310
313
  if (opts.left && opts.right)
311
314
  score = transrater.assembly_score
312
- optimal, cutoff = transrater.assembly_optimal_score
315
+
316
+ optimal, cutoff = transrater.assembly_optimal_score prefix
313
317
  unless score.nil?
314
318
  pretty_print_hash({:TRANSRATE_ASSEMBLY_SCORE => score}, report_width, 4)
315
319
  logger.info "-" * report_width
@@ -320,7 +324,7 @@ assemblies.split(',').each do |assembly|
320
324
  end
321
325
 
322
326
  # write contig metrics to file for each contig
323
- outfile = "#{opts.outfile}_#{File.basename(assembly)}_contigs.csv"
327
+ outfile = "#{prefix}_contigs.csv"
324
328
  logger.info "Writing contig metrics for each contig to #{outfile}"
325
329
  # have option to turn off, default on
326
330
  first=true
data/deps/deps.yaml CHANGED
@@ -23,9 +23,11 @@ salmon:
23
23
  binaries:
24
24
  - salmon
25
25
  libraries:
26
+ - libgcc_s.so.1
26
27
  - libgomp.so.1
27
28
  - libm.so.6
28
29
  - librt.so.1
30
+ - libstdc++.so.6
29
31
  - libtbb.so
30
32
  - libtbb.so.2
31
33
  - libtbbmalloc.so
@@ -37,10 +39,11 @@ salmon:
37
39
  - libtbb.dylib
38
40
  - libtbbmalloc.dylib
39
41
  - libtbbmalloc_proxy.dylib
42
+ - libsalmon_core.a
40
43
  version:
41
- number: '0.3'
44
+ number: '0.4'
42
45
  command: 'salmon -v'
43
46
  url:
44
47
  64bit:
45
- linux: https://github.com/kingsfordgroup/sailfish/releases/download/v0.3.0/SalmonBeta-v0.3.0_squeeze.tar.gz
46
- macosx: https://github.com/kingsfordgroup/sailfish/releases/download/v0.3.0/SalmonBeta-v0.3.0_MacOSX-10.10.2.tar.gz
48
+ linux: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_DebianSqueeze.tar.gz
49
+ macosx: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_OSX-10.10.tar.gz
@@ -48,6 +48,10 @@ module Transrate
48
48
  @assembly = {}
49
49
  @n_bases = 0
50
50
  Bio::FastaFormat.open(file).each do |entry|
51
+ if entry.seq.length == 0
52
+ logger.error "Entry found with no sequence #{entry.entry_id}"
53
+ raise AssemblyError
54
+ end
51
55
  @n_bases += entry.length
52
56
  contig = Contig.new(entry)
53
57
  if @assembly.key?(contig.name)
@@ -20,249 +20,136 @@ module Transrate
20
20
  end
21
21
 
22
22
  def run
23
- @crbblast = reciprocal_best_blast
24
- @reference_coverage = coverage @crbblast
25
- @reciprocal_hits = @crbblast.size
26
- @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
27
- @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
28
- @n_contigs_with_recip = @crbblast.reciprocals.size
29
- count_ref_crbbs
30
- @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
31
- self.run_comp_stats
23
+ crbblast = run_crb_blast
24
+ calculate_reference_coverage crbblast
32
25
  @has_run = true
33
26
  end
34
27
 
35
- def run_comp_stats
36
- @comp_stats[:CRBB_hits] = @reciprocal_hits # CRBB hits
37
- @comp_stats[:p_contigs_with_CRBB] = @p_contigs_with_recip
38
- @comp_stats[:n_contigs_with_CRBB] = @n_contigs_with_recip
39
- @comp_stats[:p_refs_with_CRBB] = @p_refs_with_recip
40
- @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
41
- @comp_stats[:rbh_per_reference] = @rbh_per_reference
42
- @comp_stats[:reference_coverage] = @reference_coverage
28
+ def calculate_reference_coverage crbblast
29
+ # The reciprocals hash in crb blast has contig names as the key.
30
+ # In order to look up by the reference name we need to reverse this.
31
+ # Scan through the reciprocals and get the Hit objects and add them to
32
+ # the @reference object for each reference sequence
33
+ get_reference_hits crbblast
34
+ per_query_contig_reference_coverage
35
+ per_target_contig_reference_coverage crbblast
43
36
  end
44
37
 
45
- def reciprocal_best_blast
46
- crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
47
- crbblast.run(1e-5, @threads, true)
48
- crbblast
49
- end
50
-
51
- # coverage of contigs that have reciprocal hits
52
- # divided by number of reciprocal targets
53
- def coverage crbblast
54
- return @reference_coverage unless @reference_coverage.nil?
55
- crbblast.reciprocals.each do |key, list|
56
- list.each_with_index do |hit, i|
38
+ def get_reference_hits crbblast
39
+ crbblast.reciprocals.each do |query_id, list|
40
+ list.each do |hit|
57
41
  unless @reference.assembly.key? hit.target
58
42
  raise TransrateError.new "#{hit.target} not in reference"
59
43
  end
60
44
  @reference[hit.target].hits << hit
45
+ end
46
+ end
47
+ @comp_stats[:CRBB_hits] = crbblast.size
48
+ @comp_stats[:n_contigs_with_CRBB] = crbblast.reciprocals.size
49
+ @comp_stats[:p_contigs_with_CRBB] = crbblast.reciprocals.size/@assembly.size.to_f
50
+ end
61
51
 
62
- unless @assembly.assembly.key? hit.query
63
- raise TransrateError.new "#{hit.query} not in assembly"
52
+ def per_query_contig_reference_coverage
53
+ # for each query contig in the @assembly find out how much it covers
54
+ # the reference
55
+ n_refs_with_recip = 0
56
+ total_crbb_hits = 0
57
+ @reference.each do |ref_contig_name, ref_contig|
58
+ ref_contig.hits.each do |hit| # a Hit from query to target
59
+ query_contig_name = hit.query
60
+ unless @assembly.assembly.key? query_contig_name
61
+ raise TransrateError.new "#{query_contig_name} not in assembly"
64
62
  end
65
- contig = @assembly[hit.query]
66
- contig.has_crb = true
67
- # how much of the reference is covered by this single contig
68
- if crbblast.target_is_prot
69
- contig.reference_coverage =
70
- (hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
63
+ @assembly[query_contig_name].has_crb = true
64
+ @assembly[query_contig_name].hits << hit
65
+ raise TransrateError.new "query should not be protein" if hit.qprot
66
+ if hit.tprot
67
+ coverage = 3*hit.alnlen+2 - 3*hit.mismatches - 3*hit.gaps
68
+ coverage /= 3.0*hit.tlen
71
69
  else
72
- contig.reference_coverage =
73
- (hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
70
+ coverage = hit.alnlen - hit.mismatches - hit.gaps
71
+ coverage /= hit.tlen.to_f
74
72
  end
75
- contig.hits << hit
73
+ @assembly[query_contig_name].reference_coverage = coverage
74
+ end
75
+
76
+ if ref_contig.hits.size > 0 # this reference has a crbblast hit
77
+ n_refs_with_recip += 1
76
78
  end
79
+ total_crbb_hits += ref_contig.hits.size
77
80
  end
81
+ @comp_stats[:rbh_per_reference] = total_crbb_hits / @reference.size.to_f
82
+ @comp_stats[:n_refs_with_CRBB] = n_refs_with_recip
83
+ @comp_stats[:p_refs_with_CRBB] = n_refs_with_recip / @reference.size.to_f
84
+ end
85
+
86
+ def per_target_contig_reference_coverage crbblast
87
+ # each target sequence in the reference can have multiple query contigs
88
+ # hit it. to calculate the reference coverage you can't just add up the
89
+ # alignment lengths. you have to make sure that overlaps are taken into
90
+ # account
91
+ coverage_thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
92
+ coverage_totals = [0, 0, 0, 0, 0]
93
+ prot = crbblast.target_is_prot
78
94
  total_coverage = 0
79
95
  total_length = 0
80
- cov = [0.25, 0.5, 0.75, 0.85, 0.95]
81
- @cov ||= [0, 0, 0, 0, 0]
82
- @reference.each_value do |ref_contig|
83
- key = ref_contig.name
84
- list = ref_contig.hits
85
- if crbblast.target_is_prot
86
- total_length += ref_contig.length * 3
96
+ @reference.each do |ref_contig_name, ref_contig|
97
+ if prot
98
+ covered = Array.new(ref_contig.length*3, false)
87
99
  else
88
- total_length += ref_contig.length
100
+ covered = Array.new(ref_contig.length, false)
89
101
  end
90
- next if list.empty?
91
- blocks = []
92
- target_length = 0
93
- list.each do |hit|
94
- target_length = hit.tlen
95
- if crbblast.target_is_prot
96
- target_length *= 3
97
- start, stop = [hit.tstart, hit.tend].minmax
98
- start = start*3-2
99
- stop = stop*3
100
- else
101
- start, stop = [hit.tstart, hit.tend].minmax
102
- end
103
- if blocks.empty?
104
- blocks << [start, stop]
105
- else
106
- found=false
107
- blocks.each do |block|
108
- # if query overlaps with any block extend that block
109
- o = overlap(block[0], block[1], start, stop)
110
- if o == 0 # perfect overlap
111
- found=true
112
- elsif o == 1 # partial overlap
113
- block[0] = start
114
- found=true
115
- elsif o == 2 # partial overlap
116
- block[1] = stop
117
- found=true
118
- elsif o == 3 # full overlap
119
- block[0] = start
120
- block[1] = stop
121
- found=true
122
- elsif o == 4 # full overlap
123
- found=true
124
- # nothing
125
- # elsif o == 5 || o == 6 # no overlap
126
-
127
- end
102
+ ref_contig.hits.each_with_index do |hit, i| # a Hit from query to target
103
+ if prot
104
+ if hit.qstart % 3 == 0
105
+ tstart = 3*hit.tstart-4
106
+ tend = 3*hit.tend
107
+ elsif hit.qstart % 3 == 1
108
+ tstart = 3*hit.tstart-2
109
+ tend = 3*hit.tend
110
+ elsif hit.qstart % 3 == 2
111
+ tstart = 3*hit.tstart-3
112
+ tend = 3*hit.tend-1
128
113
  end
129
- if !found
130
- blocks << [start, stop]
114
+ if hit.qlen % 3 == 1
115
+ tend += 1
116
+ elsif hit.qlen % 3 == 2
117
+ tend += 2
131
118
  end
132
- # if any blocks now overlap then extend one block and remove
133
- # the other
119
+ else
120
+ tstart = hit.tstart
121
+ tend = hit.tend
122
+ end
123
+ (tstart..tend).each do |b|
124
+ covered[b-1] = true # blast coords are 1 indexed
134
125
  end
135
126
  end
136
- blocks.each_with_index do |block_a,a|
137
- blocks.each_with_index do |block_b,b|
138
- if a!=b
139
- o = overlap(block_a[0], block_a[1], block_b[0], block_b[1])
140
- if o == 0 # perfect overlap
141
- block_b[0]=-1
142
- block_b[1]=-1
143
- elsif o == 1 # partial overlap
144
- block_a[0] = block_b[0]
145
- block_b[0] = -1
146
- block_b[1] = -1
147
- elsif o == 2 # partial overlap
148
- block_a[1] = block_b[1]
149
- block_b[0] = -1
150
- block_b[1] = -1
151
- elsif o == 3 # full overlap
152
- block_a[0] = block_b[0]
153
- block_a[1] = block_b[1]
154
- block_b[0] = -1
155
- block_b[1] = -1
156
- elsif o == 4 # full overlap
157
- block_b[0] = -1
158
- block_b[1] = -1
159
- # elsif o == 5 || o == 6# no overlap
160
- # do nothing
161
- # elsif # no overlap
162
- # do nothing
163
- end
164
- end
165
- end # each_with_index b
166
- end # each_with_index a
167
- # sum blocks to find total coverage
168
- length_of_coverage = calculate_coverage blocks
169
- if target_length > 0
170
- ref_p = length_of_coverage / target_length.to_f
171
- else
172
- ref_p = 0
173
- end
127
+ coverage = covered.reduce(0) { |sum, v| v ? sum + 1 : sum }
128
+ ref_p = coverage / covered.length.to_f
174
129
  ref_contig.reference_coverage = ref_p
175
-
176
- cov.each_with_index do |c, i|
177
- if ref_p >= c
178
- @cov[i] +=1
130
+ coverage_thresholds.each_with_index do |n, index|
131
+ if ref_p >= n
132
+ coverage_totals[index] += 1
179
133
  end
180
134
  end
181
135
 
182
- total_coverage += length_of_coverage
136
+ total_coverage += coverage
137
+ total_length += covered.length
183
138
  end
184
139
 
185
- cov.each_with_index do |p, i|
186
- @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
140
+ # calculate proportion of ref sequences with coverage over thresholds
141
+ coverage_thresholds.each_with_index do |p, i|
142
+ @comp_stats["cov#{(100*p).to_i}".to_sym] = coverage_totals[i]
187
143
  @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
188
- @cov[i]/@reference.size.to_f
189
- end
190
- total_coverage / total_length.to_f
191
- end
192
-
193
- # Calculate the total coverage from a set of coverage blocks
194
- def calculate_coverage blocks
195
- coverage = 0
196
- blocks.each do |block|
197
- if block[0] and block[1]
198
- if block[0]>=0 and block[1]>=0
199
- coverage += block[1] - block[0] + 1
200
- end
201
- else
202
- puts "error: key = #{key}, #{blocks}"
203
- end
204
- end
205
- coverage
206
- end
207
-
208
- # Count reference proteins with at least one recprocal hit
209
- def count_ref_crbbs
210
- @n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
211
- name, contig = entry
212
- sum + (contig.hits.length > 0 ? 1 : 0)
213
- end
214
- end
215
-
216
- def overlap(astart, astop, bstart, bstop)
217
- if astart == bstart and astop == bstop
218
- return 0
219
- elsif astart < bstart
220
- if astop > bstart
221
- if astop > bstop
222
- return 4
223
- else
224
- return 2
225
- end
226
- else
227
- return 5 # no overlap
228
- end
229
- else
230
- if bstop > astart
231
- if bstop > astop
232
- return 3
233
- else
234
- return 1
235
- end
236
- else
237
- return 6 # no overlap
238
- end
144
+ coverage_totals[i]/@reference.size.to_f
239
145
  end
146
+ @comp_stats[:reference_coverage] = total_coverage / total_length.to_f
240
147
  end
241
148
 
242
- def overlap_amount(astart, astop, bstart, bstop)
243
- if astart == bstart and astop == bstop
244
- return 1
245
- elsif astart < bstart
246
- if astop > bstart
247
- if astop > bstop
248
- return (bstop-bstart+1)/(astop-astart+1).to_f # 4
249
- else
250
- return (astop-bstart+1)/(bstop-astart+1).to_f # 2
251
- end
252
- else
253
- return 0 # 5 no overlap
254
- end
255
- else
256
- if bstop > astart
257
- if bstop > astop
258
- return (astop-astart+1)/(bstop-bstart+1).to_f # 3
259
- else
260
- return (bstop-astart+1)/(astop-bstart+1).to_f # 1
261
- end
262
- else
263
- return 0 # 6 no overlap
264
- end
265
- end
149
+ def run_crb_blast
150
+ crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
151
+ crbblast.run(1e-5, @threads, true)
152
+ crbblast
266
153
  end
267
154
 
268
155
  end # ComparativeMetrics