transrate 1.0.0.beta3 → 1.0.0.beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a663790460cbe480bce88af21df3609e8b63a073
4
- data.tar.gz: 22b4fb0af30dc78afeee914abd8b93d1e7dd7966
3
+ metadata.gz: e6662137942e3933a714a4950d4de53826b71aad
4
+ data.tar.gz: 2863b7bc2b0f63af4c43dc31c77f43324dafccfc
5
5
  SHA512:
6
- metadata.gz: ed8cad8ff40d18fd6ce7b1a78bf8a6dab2ef0b141d98a08c5403e8f766e4b11ab79f96e32f30abdae951c6fee3b5d31c3a2cc1d3c98a5a93c20ea258b6424183
7
- data.tar.gz: a60476a414dc3626d80d5d1f8ed967c3dcb79563ce97454b79b5f5595451c4285582650591b5974ac0114980482fadd8f6e6d12c50f392a41cac04fe1562ce86
6
+ metadata.gz: 30f64bf256bb98ae6031ee635c1187af680974fa97bc719d108a3878cfacaef7216fd5c8c5e0a07be077eca55d6379e1562f425eee3948d86a1f2dc816f1cfef
7
+ data.tar.gz: fb8fbd6cc684112693fd7241860724938610efa9b8950811b4cfe7b9b5232b28a8101a1c49157fa22a0aff979029628c6141f75a271871cc0a07ffe2402b7666
data/README.md CHANGED
@@ -2,8 +2,11 @@
2
2
  <img alt="Transrate - understand your transcriptome assembly" src="https://github.com/Blahah/transrate/raw/master/docs/transrate_logo_full.png">
3
3
  </p>
4
4
 
5
+ Download latest binaries: [![Download](https://api.bintray.com/packages/blahah/generic/transrate/images/download.svg)][bintray]
6
+
5
7
  Need help? Chat to us live: [![Gitter](https://badges.gitter.im/Join Chat.svg)](https://gitter.im/Blahah/transrate?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
6
8
 
9
+
7
10
  ## Development status
8
11
 
9
12
  [![Gem Version](http://img.shields.io/gem/v/transrate.svg)][gem]
@@ -19,6 +22,7 @@ Need help? Chat to us live: [![Gitter](https://badges.gitter.im/Join Chat.svg)](
19
22
  [gemnasium]: https://gemnasium.com/Blahah/transrate
20
23
  [codeclimate]: https://codeclimate.com/github/Blahah/transrate
21
24
  [coveralls]: https://coveralls.io/r/Blahah/transrate
25
+ [bintray]: https://bintray.com/blahah/generic/transrate/_latestVersion
22
26
 
23
27
  This software is being actively developed. Please be aware that there may be bugs. If you find any, please report them on the [issue tracker](https://github.com/Blahah/transrate/issues).
24
28
 
data/Rakefile CHANGED
@@ -90,7 +90,7 @@ task :default => :test
90
90
  # PACKAGING
91
91
 
92
92
  PACKAGE_NAME = "transrate"
93
- VERSION = "1.0.0.beta3"
93
+ VERSION = "1.0.0.beta4"
94
94
  TRAVELING_RUBY_VERSION = "20150210-2.2.0"
95
95
 
96
96
  desc "Package your app"
data/bin/transrate CHANGED
@@ -32,7 +32,8 @@ opts = Trollop::options do
32
32
  banner <<-EOS
33
33
 
34
34
  Transrate v#{Transrate::VERSION::STRING.dup}
35
- by Richard Smith-Unna <rds45@cam.ac.uk> and Chris Boursnell
35
+ by Richard Smith-Unna, Chris Boursnell, Rob Patro,
36
+ Julian Hibberd, and Steve Kelly
36
37
 
37
38
  DESCRIPTION:
38
39
  Analyse a de-novo transcriptome assembly using three kinds of metrics:
@@ -64,12 +65,12 @@ opts = Trollop::options do
64
65
  EOS
65
66
  opt :assembly, "Assembly file(s) in FASTA format, comma-separated",
66
67
  :type => String
67
- opt :reference, "Reference proteome file in FASTA format",
68
- :type => String
69
68
  opt :left, "Left reads file in FASTQ format",
70
69
  :type => String
71
70
  opt :right, "Right reads file in FASTQ format",
72
71
  :type => String
72
+ opt :reference, "Reference proteome file in FASTA format",
73
+ :type => String
73
74
  opt :threads, "Number of threads to use",
74
75
  :default => 8,
75
76
  :type => Integer
@@ -95,7 +96,7 @@ blast_dep = File.join(gem_dir, 'deps', 'blast.yaml')
95
96
  deps, read_deps, ref_deps = nil
96
97
  unless opts.install_deps.nil?
97
98
 
98
- unless %w[all read red].include? opts.install_deps
99
+ unless %w[all read ref].include? opts.install_deps
99
100
  raise TransrateError.new "install-deps #{opts.install_deps} is not valid. " +
100
101
  "You must specify one of: all, read, ref."
101
102
  end
@@ -110,12 +111,12 @@ if deps || read_deps || ref_deps
110
111
  puts "Checking dependencies"
111
112
 
112
113
  missing = []
113
- if opts.install_deps || opts.install_read_deps
114
+ if deps || read_deps
114
115
  Bindeps.require gem_deps
115
116
  missing += Bindeps.missing gem_deps
116
117
  end
117
118
 
118
- if opts.install_deps || opts.install_ref_deps
119
+ if deps || ref_deps
119
120
  Bindeps.require blast_dep
120
121
  missing += Bindeps.missing blast_dep
121
122
  end
@@ -307,9 +308,12 @@ assemblies.split(',').each do |assembly|
307
308
  logger.info "No reference provided, skipping comparative diagnostics"
308
309
  end
309
310
 
311
+ prefix = "#{opts.outfile}_#{File.basename(assembly)}"
312
+
310
313
  if (opts.left && opts.right)
311
314
  score = transrater.assembly_score
312
- optimal, cutoff = transrater.assembly_optimal_score
315
+
316
+ optimal, cutoff = transrater.assembly_optimal_score prefix
313
317
  unless score.nil?
314
318
  pretty_print_hash({:TRANSRATE_ASSEMBLY_SCORE => score}, report_width, 4)
315
319
  logger.info "-" * report_width
@@ -320,7 +324,7 @@ assemblies.split(',').each do |assembly|
320
324
  end
321
325
 
322
326
  # write contig metrics to file for each contig
323
- outfile = "#{opts.outfile}_#{File.basename(assembly)}_contigs.csv"
327
+ outfile = "#{prefix}_contigs.csv"
324
328
  logger.info "Writing contig metrics for each contig to #{outfile}"
325
329
  # have option to turn off, default on
326
330
  first=true
data/deps/deps.yaml CHANGED
@@ -23,9 +23,11 @@ salmon:
23
23
  binaries:
24
24
  - salmon
25
25
  libraries:
26
+ - libgcc_s.so.1
26
27
  - libgomp.so.1
27
28
  - libm.so.6
28
29
  - librt.so.1
30
+ - libstdc++.so.6
29
31
  - libtbb.so
30
32
  - libtbb.so.2
31
33
  - libtbbmalloc.so
@@ -37,10 +39,11 @@ salmon:
37
39
  - libtbb.dylib
38
40
  - libtbbmalloc.dylib
39
41
  - libtbbmalloc_proxy.dylib
42
+ - libsalmon_core.a
40
43
  version:
41
- number: '0.3'
44
+ number: '0.4'
42
45
  command: 'salmon -v'
43
46
  url:
44
47
  64bit:
45
- linux: https://github.com/kingsfordgroup/sailfish/releases/download/v0.3.0/SalmonBeta-v0.3.0_squeeze.tar.gz
46
- macosx: https://github.com/kingsfordgroup/sailfish/releases/download/v0.3.0/SalmonBeta-v0.3.0_MacOSX-10.10.2.tar.gz
48
+ linux: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_DebianSqueeze.tar.gz
49
+ macosx: https://github.com/COMBINE-lab/salmon/releases/download/v0.4.0/SalmonBeta-0.4.0_OSX-10.10.tar.gz
@@ -48,6 +48,10 @@ module Transrate
48
48
  @assembly = {}
49
49
  @n_bases = 0
50
50
  Bio::FastaFormat.open(file).each do |entry|
51
+ if entry.seq.length == 0
52
+ logger.error "Entry found with no sequence #{entry.entry_id}"
53
+ raise AssemblyError
54
+ end
51
55
  @n_bases += entry.length
52
56
  contig = Contig.new(entry)
53
57
  if @assembly.key?(contig.name)
@@ -20,249 +20,136 @@ module Transrate
20
20
  end
21
21
 
22
22
  def run
23
- @crbblast = reciprocal_best_blast
24
- @reference_coverage = coverage @crbblast
25
- @reciprocal_hits = @crbblast.size
26
- @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
27
- @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
28
- @n_contigs_with_recip = @crbblast.reciprocals.size
29
- count_ref_crbbs
30
- @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
31
- self.run_comp_stats
23
+ crbblast = run_crb_blast
24
+ calculate_reference_coverage crbblast
32
25
  @has_run = true
33
26
  end
34
27
 
35
- def run_comp_stats
36
- @comp_stats[:CRBB_hits] = @reciprocal_hits # CRBB hits
37
- @comp_stats[:p_contigs_with_CRBB] = @p_contigs_with_recip
38
- @comp_stats[:n_contigs_with_CRBB] = @n_contigs_with_recip
39
- @comp_stats[:p_refs_with_CRBB] = @p_refs_with_recip
40
- @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
41
- @comp_stats[:rbh_per_reference] = @rbh_per_reference
42
- @comp_stats[:reference_coverage] = @reference_coverage
28
+ def calculate_reference_coverage crbblast
29
+ # The reciprocals hash in crb blast has contig names as the key.
30
+ # In order to look up by the reference name we need to reverse this.
31
+ # Scan through the reciprocals and get the Hit objects and add them to
32
+ # the @reference object for each reference sequence
33
+ get_reference_hits crbblast
34
+ per_query_contig_reference_coverage
35
+ per_target_contig_reference_coverage crbblast
43
36
  end
44
37
 
45
- def reciprocal_best_blast
46
- crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
47
- crbblast.run(1e-5, @threads, true)
48
- crbblast
49
- end
50
-
51
- # coverage of contigs that have reciprocal hits
52
- # divided by number of reciprocal targets
53
- def coverage crbblast
54
- return @reference_coverage unless @reference_coverage.nil?
55
- crbblast.reciprocals.each do |key, list|
56
- list.each_with_index do |hit, i|
38
+ def get_reference_hits crbblast
39
+ crbblast.reciprocals.each do |query_id, list|
40
+ list.each do |hit|
57
41
  unless @reference.assembly.key? hit.target
58
42
  raise TransrateError.new "#{hit.target} not in reference"
59
43
  end
60
44
  @reference[hit.target].hits << hit
45
+ end
46
+ end
47
+ @comp_stats[:CRBB_hits] = crbblast.size
48
+ @comp_stats[:n_contigs_with_CRBB] = crbblast.reciprocals.size
49
+ @comp_stats[:p_contigs_with_CRBB] = crbblast.reciprocals.size/@assembly.size.to_f
50
+ end
61
51
 
62
- unless @assembly.assembly.key? hit.query
63
- raise TransrateError.new "#{hit.query} not in assembly"
52
+ def per_query_contig_reference_coverage
53
+ # for each query contig in the @assembly find out how much it covers
54
+ # the reference
55
+ n_refs_with_recip = 0
56
+ total_crbb_hits = 0
57
+ @reference.each do |ref_contig_name, ref_contig|
58
+ ref_contig.hits.each do |hit| # a Hit from query to target
59
+ query_contig_name = hit.query
60
+ unless @assembly.assembly.key? query_contig_name
61
+ raise TransrateError.new "#{query_contig_name} not in assembly"
64
62
  end
65
- contig = @assembly[hit.query]
66
- contig.has_crb = true
67
- # how much of the reference is covered by this single contig
68
- if crbblast.target_is_prot
69
- contig.reference_coverage =
70
- (hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
63
+ @assembly[query_contig_name].has_crb = true
64
+ @assembly[query_contig_name].hits << hit
65
+ raise TransrateError.new "query should not be protein" if hit.qprot
66
+ if hit.tprot
67
+ coverage = 3*hit.alnlen+2 - 3*hit.mismatches - 3*hit.gaps
68
+ coverage /= 3.0*hit.tlen
71
69
  else
72
- contig.reference_coverage =
73
- (hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
70
+ coverage = hit.alnlen - hit.mismatches - hit.gaps
71
+ coverage /= hit.tlen.to_f
74
72
  end
75
- contig.hits << hit
73
+ @assembly[query_contig_name].reference_coverage = coverage
74
+ end
75
+
76
+ if ref_contig.hits.size > 0 # this reference has a crbblast hit
77
+ n_refs_with_recip += 1
76
78
  end
79
+ total_crbb_hits += ref_contig.hits.size
77
80
  end
81
+ @comp_stats[:rbh_per_reference] = total_crbb_hits / @reference.size.to_f
82
+ @comp_stats[:n_refs_with_CRBB] = n_refs_with_recip
83
+ @comp_stats[:p_refs_with_CRBB] = n_refs_with_recip / @reference.size.to_f
84
+ end
85
+
86
+ def per_target_contig_reference_coverage crbblast
87
+ # each target sequence in the reference can have multiple query contigs
88
+ # hit it. to calculate the reference coverage you can't just add up the
89
+ # alignment lengths. you have to make sure that overlaps are taken into
90
+ # account
91
+ coverage_thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
92
+ coverage_totals = [0, 0, 0, 0, 0]
93
+ prot = crbblast.target_is_prot
78
94
  total_coverage = 0
79
95
  total_length = 0
80
- cov = [0.25, 0.5, 0.75, 0.85, 0.95]
81
- @cov ||= [0, 0, 0, 0, 0]
82
- @reference.each_value do |ref_contig|
83
- key = ref_contig.name
84
- list = ref_contig.hits
85
- if crbblast.target_is_prot
86
- total_length += ref_contig.length * 3
96
+ @reference.each do |ref_contig_name, ref_contig|
97
+ if prot
98
+ covered = Array.new(ref_contig.length*3, false)
87
99
  else
88
- total_length += ref_contig.length
100
+ covered = Array.new(ref_contig.length, false)
89
101
  end
90
- next if list.empty?
91
- blocks = []
92
- target_length = 0
93
- list.each do |hit|
94
- target_length = hit.tlen
95
- if crbblast.target_is_prot
96
- target_length *= 3
97
- start, stop = [hit.tstart, hit.tend].minmax
98
- start = start*3-2
99
- stop = stop*3
100
- else
101
- start, stop = [hit.tstart, hit.tend].minmax
102
- end
103
- if blocks.empty?
104
- blocks << [start, stop]
105
- else
106
- found=false
107
- blocks.each do |block|
108
- # if query overlaps with any block extend that block
109
- o = overlap(block[0], block[1], start, stop)
110
- if o == 0 # perfect overlap
111
- found=true
112
- elsif o == 1 # partial overlap
113
- block[0] = start
114
- found=true
115
- elsif o == 2 # partial overlap
116
- block[1] = stop
117
- found=true
118
- elsif o == 3 # full overlap
119
- block[0] = start
120
- block[1] = stop
121
- found=true
122
- elsif o == 4 # full overlap
123
- found=true
124
- # nothing
125
- # elsif o == 5 || o == 6 # no overlap
126
-
127
- end
102
+ ref_contig.hits.each_with_index do |hit, i| # a Hit from query to target
103
+ if prot
104
+ if hit.qstart % 3 == 0
105
+ tstart = 3*hit.tstart-4
106
+ tend = 3*hit.tend
107
+ elsif hit.qstart % 3 == 1
108
+ tstart = 3*hit.tstart-2
109
+ tend = 3*hit.tend
110
+ elsif hit.qstart % 3 == 2
111
+ tstart = 3*hit.tstart-3
112
+ tend = 3*hit.tend-1
128
113
  end
129
- if !found
130
- blocks << [start, stop]
114
+ if hit.qlen % 3 == 1
115
+ tend += 1
116
+ elsif hit.qlen % 3 == 2
117
+ tend += 2
131
118
  end
132
- # if any blocks now overlap then extend one block and remove
133
- # the other
119
+ else
120
+ tstart = hit.tstart
121
+ tend = hit.tend
122
+ end
123
+ (tstart..tend).each do |b|
124
+ covered[b-1] = true # blast coords are 1 indexed
134
125
  end
135
126
  end
136
- blocks.each_with_index do |block_a,a|
137
- blocks.each_with_index do |block_b,b|
138
- if a!=b
139
- o = overlap(block_a[0], block_a[1], block_b[0], block_b[1])
140
- if o == 0 # perfect overlap
141
- block_b[0]=-1
142
- block_b[1]=-1
143
- elsif o == 1 # partial overlap
144
- block_a[0] = block_b[0]
145
- block_b[0] = -1
146
- block_b[1] = -1
147
- elsif o == 2 # partial overlap
148
- block_a[1] = block_b[1]
149
- block_b[0] = -1
150
- block_b[1] = -1
151
- elsif o == 3 # full overlap
152
- block_a[0] = block_b[0]
153
- block_a[1] = block_b[1]
154
- block_b[0] = -1
155
- block_b[1] = -1
156
- elsif o == 4 # full overlap
157
- block_b[0] = -1
158
- block_b[1] = -1
159
- # elsif o == 5 || o == 6# no overlap
160
- # do nothing
161
- # elsif # no overlap
162
- # do nothing
163
- end
164
- end
165
- end # each_with_index b
166
- end # each_with_index a
167
- # sum blocks to find total coverage
168
- length_of_coverage = calculate_coverage blocks
169
- if target_length > 0
170
- ref_p = length_of_coverage / target_length.to_f
171
- else
172
- ref_p = 0
173
- end
127
+ coverage = covered.reduce(0) { |sum, v| v ? sum + 1 : sum }
128
+ ref_p = coverage / covered.length.to_f
174
129
  ref_contig.reference_coverage = ref_p
175
-
176
- cov.each_with_index do |c, i|
177
- if ref_p >= c
178
- @cov[i] +=1
130
+ coverage_thresholds.each_with_index do |n, index|
131
+ if ref_p >= n
132
+ coverage_totals[index] += 1
179
133
  end
180
134
  end
181
135
 
182
- total_coverage += length_of_coverage
136
+ total_coverage += coverage
137
+ total_length += covered.length
183
138
  end
184
139
 
185
- cov.each_with_index do |p, i|
186
- @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
140
+ # calculate proportion of ref sequences with coverage over thresholds
141
+ coverage_thresholds.each_with_index do |p, i|
142
+ @comp_stats["cov#{(100*p).to_i}".to_sym] = coverage_totals[i]
187
143
  @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
188
- @cov[i]/@reference.size.to_f
189
- end
190
- total_coverage / total_length.to_f
191
- end
192
-
193
- # Calculate the total coverage from a set of coverage blocks
194
- def calculate_coverage blocks
195
- coverage = 0
196
- blocks.each do |block|
197
- if block[0] and block[1]
198
- if block[0]>=0 and block[1]>=0
199
- coverage += block[1] - block[0] + 1
200
- end
201
- else
202
- puts "error: key = #{key}, #{blocks}"
203
- end
204
- end
205
- coverage
206
- end
207
-
208
- # Count reference proteins with at least one recprocal hit
209
- def count_ref_crbbs
210
- @n_refs_with_recip = @reference.assembly.inject(0) do |sum, entry|
211
- name, contig = entry
212
- sum + (contig.hits.length > 0 ? 1 : 0)
213
- end
214
- end
215
-
216
- def overlap(astart, astop, bstart, bstop)
217
- if astart == bstart and astop == bstop
218
- return 0
219
- elsif astart < bstart
220
- if astop > bstart
221
- if astop > bstop
222
- return 4
223
- else
224
- return 2
225
- end
226
- else
227
- return 5 # no overlap
228
- end
229
- else
230
- if bstop > astart
231
- if bstop > astop
232
- return 3
233
- else
234
- return 1
235
- end
236
- else
237
- return 6 # no overlap
238
- end
144
+ coverage_totals[i]/@reference.size.to_f
239
145
  end
146
+ @comp_stats[:reference_coverage] = total_coverage / total_length.to_f
240
147
  end
241
148
 
242
- def overlap_amount(astart, astop, bstart, bstop)
243
- if astart == bstart and astop == bstop
244
- return 1
245
- elsif astart < bstart
246
- if astop > bstart
247
- if astop > bstop
248
- return (bstop-bstart+1)/(astop-astart+1).to_f # 4
249
- else
250
- return (astop-bstart+1)/(bstop-astart+1).to_f # 2
251
- end
252
- else
253
- return 0 # 5 no overlap
254
- end
255
- else
256
- if bstop > astart
257
- if bstop > astop
258
- return (astop-astart+1)/(bstop-bstart+1).to_f # 3
259
- else
260
- return (bstop-astart+1)/(astop-bstart+1).to_f # 1
261
- end
262
- else
263
- return 0 # 6 no overlap
264
- end
265
- end
149
+ def run_crb_blast
150
+ crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
151
+ crbblast.run(1e-5, @threads, true)
152
+ crbblast
266
153
  end
267
154
 
268
155
  end # ComparativeMetrics