transrate 0.3.1 → 1.0.0.alpha.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +7 -0
- data/README.md +7 -6
- data/bin/transrate +21 -9
- data/deps/deps.yaml +49 -14
- data/ext/transrate/transrate.c +200 -166
- data/lib/transrate.rb +2 -3
- data/lib/transrate/assembly.rb +0 -49
- data/lib/transrate/cmd.rb +4 -0
- data/lib/transrate/comparative_metrics.rb +16 -64
- data/lib/transrate/contig.rb +57 -18
- data/lib/transrate/express.rb +79 -0
- data/lib/transrate/read_metrics.rb +196 -185
- data/lib/transrate/samtools.rb +88 -16
- data/lib/transrate/snap.rb +123 -0
- data/lib/transrate/transrater.rb +16 -19
- data/lib/transrate/version.rb +4 -4
- data/test/data/bridging_reads.l.fastq +20 -0
- data/test/data/bridging_reads.r.fastq +20 -0
- data/test/test_bin.rb +50 -21
- data/test/test_comp_metrics.rb +3 -27
- data/test/test_contig.rb +8 -0
- data/test/test_inline.rb +1 -1
- data/test/test_read_metrics.rb +108 -19
- data/test/test_transrater.rb +5 -5
- data/transrate.gemspec +2 -5
- metadata +66 -129
- data/lib/transrate/bowtie2.rb +0 -75
- data/lib/transrate/dimension_reduce.rb +0 -18
- data/lib/transrate/metric.rb +0 -16
- data/test/test_bowtie.rb +0 -66
data/lib/transrate.rb
CHANGED
@@ -7,12 +7,11 @@ require 'transrate/transrater'
|
|
7
7
|
require 'transrate/version'
|
8
8
|
require 'transrate/contig'
|
9
9
|
require 'transrate/assembly'
|
10
|
-
require 'transrate/bowtie2'
|
10
|
+
require 'transrate/snap'
|
11
|
+
require 'transrate/express'
|
11
12
|
require 'transrate/read_metrics'
|
12
13
|
require 'transrate/comparative_metrics'
|
13
14
|
require 'transrate/contig_metrics'
|
14
|
-
require 'transrate/metric'
|
15
|
-
require 'transrate/dimension_reduce'
|
16
15
|
require 'transrate/samtools'
|
17
16
|
require 'transrate/cmd'
|
18
17
|
require 'transrate/transrate.so'
|
data/lib/transrate/assembly.rb
CHANGED
@@ -184,55 +184,6 @@ module Transrate
|
|
184
184
|
|
185
185
|
end # basic_bin_stats
|
186
186
|
|
187
|
-
# Calls *block* with two arguments, the contig and an array
|
188
|
-
# of integer per-base coverage counts.
|
189
|
-
#
|
190
|
-
# @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
|
191
|
-
# @param block [Block] the block to call
|
192
|
-
def each_with_coverage(bam, &block)
|
193
|
-
logger.debug 'enumerating assembly with coverage'
|
194
|
-
# generate coverage with samtools
|
195
|
-
covfile = Samtools.coverage bam
|
196
|
-
# get an assembly enumerator
|
197
|
-
assembly_enum = @assembly.to_enum
|
198
|
-
contig_name, contig = assembly_enum.next
|
199
|
-
# precreate an array of the correct size to contain
|
200
|
-
# coverage. this is necessary because samtools mpileup
|
201
|
-
# doesn't print a result line for bases with 0 coverage
|
202
|
-
contig.coverage = Array.new(contig.length, 0)
|
203
|
-
# the columns we need
|
204
|
-
name_i, pos_i, cov_i = 0, 1, 3
|
205
|
-
# parse the coverage file
|
206
|
-
File.open(covfile).each_line do |line|
|
207
|
-
cols = line.chomp.split("\t")
|
208
|
-
unless (cols && cols.length > 4)
|
209
|
-
# last line
|
210
|
-
break
|
211
|
-
end
|
212
|
-
# extract the columns
|
213
|
-
name = Bio::FastaDefline.new(cols[name_i]).entry_id
|
214
|
-
pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
|
215
|
-
unless contig_name == name
|
216
|
-
while contig_name != name
|
217
|
-
begin
|
218
|
-
block.call(contig, contig.coverage)
|
219
|
-
contig_name, contig = assembly_enum.next
|
220
|
-
contig.coverage = Array.new(contig.length, 0)
|
221
|
-
rescue StopIteration => stop_error
|
222
|
-
logger.error 'reached the end of assembly enumerator while ' +
|
223
|
-
'there were contigs left in the coverage results'
|
224
|
-
logger.error "final assembly contig: #{@assembly.last.name}"
|
225
|
-
logger.error "coverage contig: #{name}"
|
226
|
-
raise stop_error
|
227
|
-
end
|
228
|
-
end
|
229
|
-
end
|
230
|
-
contig.coverage[pos - 1] = cov
|
231
|
-
end
|
232
|
-
# yield the final contig
|
233
|
-
block.call(contig, contig.coverage)
|
234
|
-
end
|
235
|
-
|
236
187
|
end # Assembly
|
237
188
|
|
238
189
|
end # Transrate
|
data/lib/transrate/cmd.rb
CHANGED
@@ -11,7 +11,6 @@ module Transrate
|
|
11
11
|
attr_reader :has_run
|
12
12
|
attr_reader :reference_coverage
|
13
13
|
attr_reader :comp_stats
|
14
|
-
attr_reader :n_chimeras, :p_chimeras
|
15
14
|
|
16
15
|
def initialize assembly, reference, threads
|
17
16
|
@assembly = assembly
|
@@ -23,14 +22,12 @@ module Transrate
|
|
23
22
|
def run
|
24
23
|
@crbblast = reciprocal_best_blast
|
25
24
|
@reference_coverage = coverage @crbblast
|
26
|
-
@collapse_factor = collapse_factor @crbblast.reciprocals
|
27
25
|
@reciprocal_hits = @crbblast.size
|
28
26
|
@rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
|
29
27
|
@p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
|
30
28
|
@n_contigs_with_recip = @crbblast.reciprocals.size
|
31
29
|
count_ref_crbbs
|
32
30
|
@p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
|
33
|
-
chimeras @crbblast
|
34
31
|
self.run_comp_stats
|
35
32
|
@has_run = true
|
36
33
|
end
|
@@ -43,9 +40,6 @@ module Transrate
|
|
43
40
|
@comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
|
44
41
|
@comp_stats[:rbh_per_reference] = @rbh_per_reference
|
45
42
|
@comp_stats[:reference_coverage] = @reference_coverage
|
46
|
-
@comp_stats[:collapse_factor] = @collapse_factor
|
47
|
-
@comp_stats[:n_chimeras] = @n_chimeras
|
48
|
-
@comp_stats[:p_chimeras] = @p_chimeras
|
49
43
|
end
|
50
44
|
|
51
45
|
def reciprocal_best_blast
|
@@ -71,19 +65,29 @@ module Transrate
|
|
71
65
|
contig = @assembly[hit.query]
|
72
66
|
contig.has_crb = true
|
73
67
|
# how much of the reference is covered by this single contig
|
74
|
-
|
68
|
+
if crbblast.target_is_prot
|
69
|
+
contig.reference_coverage =
|
70
|
+
(hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
|
71
|
+
else
|
72
|
+
contig.reference_coverage =
|
73
|
+
(hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
|
74
|
+
end
|
75
75
|
contig.hits << hit
|
76
76
|
end
|
77
77
|
end
|
78
78
|
total_coverage = 0
|
79
79
|
total_length = 0
|
80
80
|
cov = [0.25, 0.5, 0.75, 0.85, 0.95]
|
81
|
+
@cov ||= [0, 0, 0, 0, 0]
|
81
82
|
@reference.each_value do |ref_contig|
|
82
83
|
key = ref_contig.name
|
83
84
|
list = ref_contig.hits
|
84
|
-
|
85
|
-
|
86
|
-
|
85
|
+
if crbblast.target_is_prot
|
86
|
+
total_length += ref_contig.length * 3
|
87
|
+
else
|
88
|
+
total_length += ref_contig.length
|
89
|
+
end
|
90
|
+
next if list.empty?
|
87
91
|
blocks = []
|
88
92
|
target_length = 0
|
89
93
|
list.each do |hit|
|
@@ -162,9 +166,7 @@ module Transrate
|
|
162
166
|
end # each_with_index a
|
163
167
|
# sum blocks to find total coverage
|
164
168
|
length_of_coverage = calculate_coverage blocks
|
165
|
-
@cov ||= [0, 0, 0, 0, 0]
|
166
169
|
if target_length > 0
|
167
|
-
# puts "#{length_of_coverage} / #{target_length.to_f}"
|
168
170
|
ref_p = length_of_coverage / target_length.to_f
|
169
171
|
else
|
170
172
|
ref_p = 0
|
@@ -179,10 +181,11 @@ module Transrate
|
|
179
181
|
|
180
182
|
total_coverage += length_of_coverage
|
181
183
|
end
|
184
|
+
|
182
185
|
cov.each_with_index do |p, i|
|
183
186
|
@comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
|
184
187
|
@comp_stats["p_cov#{(100*p).to_i}".to_sym] =
|
185
|
-
|
188
|
+
@cov[i]/@reference.size.to_f
|
186
189
|
end
|
187
190
|
total_coverage / total_length.to_f
|
188
191
|
end
|
@@ -210,44 +213,6 @@ module Transrate
|
|
210
213
|
end
|
211
214
|
end
|
212
215
|
|
213
|
-
def chimeras crbblast
|
214
|
-
@n_chimeras = 0
|
215
|
-
crbblast.reciprocals.each_pair do |key, list|
|
216
|
-
p = 0
|
217
|
-
list.each_with_index do |a, i|
|
218
|
-
list.each_with_index do |b, j|
|
219
|
-
if j>i
|
220
|
-
if a.target == b.target
|
221
|
-
astart, astop = [a.tstart, a.tend].minmax
|
222
|
-
bstart, bstop = [b.tstart, b.tend].minmax
|
223
|
-
|
224
|
-
oa = overlap_amount(astart, astop, bstart, bstop)
|
225
|
-
if oa > 0.75
|
226
|
-
p += 1
|
227
|
-
end
|
228
|
-
else
|
229
|
-
astart, astop = [a.qstart, a.qend].minmax
|
230
|
-
bstart, bstop = [b.qstart, b.qend].minmax
|
231
|
-
|
232
|
-
oa = overlap_amount(astart, astop, bstart, bstop)
|
233
|
-
if oa < 0.25
|
234
|
-
p += 1
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
238
|
-
end
|
239
|
-
end
|
240
|
-
if p/list.size.to_f >= 0.5
|
241
|
-
@n_chimeras += 1
|
242
|
-
unless @assembly.assembly.key? key
|
243
|
-
puts "key not in assembly: #{key}"
|
244
|
-
end
|
245
|
-
@assembly[key].is_chimera = true
|
246
|
-
end
|
247
|
-
end
|
248
|
-
@p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
|
249
|
-
end
|
250
|
-
|
251
216
|
def overlap(astart, astop, bstart, bstop)
|
252
217
|
if astart == bstart and astop == bstop
|
253
218
|
return 0
|
@@ -300,19 +265,6 @@ module Transrate
|
|
300
265
|
end
|
301
266
|
end
|
302
267
|
|
303
|
-
# Count unique reference proteins per contig
|
304
|
-
def collapse_factor reciprocals
|
305
|
-
return @collapse_factor unless @collapse_factor.nil?
|
306
|
-
cf_sum = 0
|
307
|
-
reciprocals.each do |query, hits|
|
308
|
-
uniq_hits = Set.new hits.map{ |h| h.target }
|
309
|
-
cf = uniq_hits.length
|
310
|
-
@assembly[query].collapse_factor = cf
|
311
|
-
cf_sum += cf
|
312
|
-
end
|
313
|
-
cf_sum / reciprocals.size
|
314
|
-
end
|
315
|
-
|
316
268
|
end # ComparativeMetrics
|
317
269
|
|
318
270
|
end # Transrate
|
data/lib/transrate/contig.rb
CHANGED
@@ -10,9 +10,12 @@ module Transrate
|
|
10
10
|
def_delegators :@seq, :size, :length
|
11
11
|
attr_accessor :seq, :name
|
12
12
|
# read-based metrics
|
13
|
-
attr_accessor :coverage, :uncovered_bases, :
|
13
|
+
attr_accessor :coverage, :uncovered_bases, :p_uncovered_bases
|
14
|
+
attr_accessor :p_seq_true, :p_unique
|
15
|
+
attr_accessor :low_uniqueness_bases, :in_bridges
|
16
|
+
attr_accessor :p_good, :p_not_segmented
|
14
17
|
# reference-based metrics
|
15
|
-
attr_accessor :has_crb, :
|
18
|
+
attr_accessor :has_crb, :reference_coverage
|
16
19
|
attr_accessor :hits
|
17
20
|
|
18
21
|
def initialize(seq, name: nil)
|
@@ -22,11 +25,16 @@ module Transrate
|
|
22
25
|
@name = seq.respond_to?(:entry_id) ? seq.entry_id : name
|
23
26
|
@hits = []
|
24
27
|
@reference_coverage = 0
|
25
|
-
@collapse_factor = 0
|
26
|
-
@is_chimera = false
|
27
28
|
@has_crb = false
|
28
29
|
@in_bridges = 0
|
29
|
-
@
|
30
|
+
@p_seq_true = 0
|
31
|
+
@low_uniqueness_bases = 0
|
32
|
+
@p_good = -1
|
33
|
+
@uncovered_bases = length
|
34
|
+
@p_uncovered_bases = 1
|
35
|
+
@p_unique = 0
|
36
|
+
@p_not_segmented = 1
|
37
|
+
@score = -1
|
30
38
|
end
|
31
39
|
|
32
40
|
def each &block
|
@@ -43,34 +51,38 @@ module Transrate
|
|
43
51
|
:cpg_count => cpg_count,
|
44
52
|
:cpg_ratio => cpg_ratio,
|
45
53
|
:orf_length => orf_length,
|
46
|
-
:linguistic_complexity_6 => linguistic_complexity(6)
|
54
|
+
:linguistic_complexity_6 => linguistic_complexity(6),
|
47
55
|
}
|
48
56
|
end
|
49
57
|
|
50
58
|
def read_metrics
|
51
|
-
read = @
|
52
|
-
:
|
53
|
-
:
|
54
|
-
:
|
59
|
+
read = @p_good>=0 ? {
|
60
|
+
:in_bridges => in_bridges,
|
61
|
+
:p_good => @p_good,
|
62
|
+
:p_bases_covered => p_bases_covered,
|
63
|
+
:p_seq_true => p_seq_true,
|
64
|
+
:score => score,
|
65
|
+
:p_unique => p_unique,
|
66
|
+
:p_not_segmented => p_not_segmented
|
55
67
|
} : {
|
56
|
-
:
|
57
|
-
:
|
58
|
-
:
|
68
|
+
:in_bridges => "NA",
|
69
|
+
:p_good => "NA",
|
70
|
+
:p_bases_covered => "NA",
|
71
|
+
:p_seq_true => "NA",
|
72
|
+
:score => "NA",
|
73
|
+
:p_unique => p_unique,
|
74
|
+
:p_not_segmented => p_not_segmented
|
59
75
|
}
|
60
76
|
end
|
61
77
|
|
62
78
|
def comparative_metrics
|
63
79
|
reference = @has_crb ? {
|
64
80
|
:has_crb => has_crb,
|
65
|
-
:collapse_factor => collapse_factor,
|
66
81
|
:reference_coverage => reference_coverage,
|
67
|
-
:is_chimera => is_chimera,
|
68
82
|
:hits => hits.map{ |h| h.target }.join(";")
|
69
83
|
} : {
|
70
84
|
:has_crb => false,
|
71
|
-
:collapse_factor => "NA",
|
72
85
|
:reference_coverage => "NA",
|
73
|
-
:is_chimera => "NA",
|
74
86
|
:hits => "NA"
|
75
87
|
}
|
76
88
|
end
|
@@ -89,7 +101,7 @@ module Transrate
|
|
89
101
|
composition(@seq.seq)
|
90
102
|
alphabet = ['a', 'c', 'g', 't', 'n']
|
91
103
|
@base_composition = {}
|
92
|
-
@dibase_composition={}
|
104
|
+
@dibase_composition = {}
|
93
105
|
bases = []
|
94
106
|
dibases = []
|
95
107
|
alphabet.each do |c|
|
@@ -208,6 +220,33 @@ module Transrate
|
|
208
220
|
def linguistic_complexity k
|
209
221
|
return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
|
210
222
|
end
|
223
|
+
|
224
|
+
def p_bases_covered
|
225
|
+
1 - p_uncovered_bases
|
226
|
+
end
|
227
|
+
|
228
|
+
def uncovered_bases= n
|
229
|
+
@uncovered_bases = n
|
230
|
+
@p_uncovered_bases = n / length.to_f
|
231
|
+
end
|
232
|
+
|
233
|
+
def p_unique_bases
|
234
|
+
(length - low_uniqueness_bases) / length.to_f
|
235
|
+
end
|
236
|
+
|
237
|
+
# Contig score (geometric mean of all score components)
|
238
|
+
def score
|
239
|
+
return @score if @score != -1
|
240
|
+
prod =
|
241
|
+
[p_bases_covered, 0.01].max * # proportion of bases covered
|
242
|
+
[p_not_segmented, 0.01].max * # prob contig has 0 changepoints
|
243
|
+
[p_good, 0.01].max * # proportion of reads that mapped good
|
244
|
+
[p_seq_true, 0.01].max * # scaled 1 - mean per-base edit distance
|
245
|
+
[p_unique, 0.01].max # prop mapQ >= 5
|
246
|
+
s = prod ** (1.0 / 5)
|
247
|
+
s = 0.01 if !s
|
248
|
+
@score = [s, 0.01].max
|
249
|
+
end
|
211
250
|
end
|
212
251
|
|
213
252
|
end
|
data/lib/transrate/express.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
module Transrate
|
3
|
+
|
4
|
+
class ExpressError < StandardError
|
5
|
+
end
|
6
|
+
|
7
|
+
class Express
|
8
|
+
|
9
|
+
require 'ostruct'
|
10
|
+
|
11
|
+
# return an Express object
|
12
|
+
def initialize
|
13
|
+
which = Cmd.new('which express')
|
14
|
+
which.run
|
15
|
+
if !which.status.success?
|
16
|
+
raise ExpressError.new("could not find express in the path")
|
17
|
+
end
|
18
|
+
@express = which.stdout.split("\n").first
|
19
|
+
end
|
20
|
+
|
21
|
+
# return struct containing:
|
22
|
+
# results_file => path to the express results TSV
|
23
|
+
# expression => a hash of target => effective_count
|
24
|
+
# align_samp => path to the sampled alignments file
|
25
|
+
def run assembly, bamfile
|
26
|
+
assembly = assembly.file if assembly.is_a? Assembly
|
27
|
+
|
28
|
+
ex_output = 'results.xprs'
|
29
|
+
fin_output = "#{File.basename assembly}_#{ex_output}"
|
30
|
+
|
31
|
+
unless File.exists? fin_output
|
32
|
+
runner = Cmd.new build_command(assembly, bamfile)
|
33
|
+
runner.run
|
34
|
+
unless runner.status.success?
|
35
|
+
raise ExpressError.new("Express failed\n" +
|
36
|
+
runner.stderr + "\n" +
|
37
|
+
runner.stdout)
|
38
|
+
end
|
39
|
+
File.rename(ex_output, fin_output)
|
40
|
+
end
|
41
|
+
|
42
|
+
OpenStruct.new(:results_file => fin_output,
|
43
|
+
:expression => load_expression(fin_output),
|
44
|
+
:align_samp => 'hits.1.samp.bam')
|
45
|
+
end
|
46
|
+
|
47
|
+
# return the constructed eXpress command
|
48
|
+
def build_command assembly, bamfile
|
49
|
+
cmd = "#{@express}"
|
50
|
+
cmd << " #{File.expand_path assembly}"
|
51
|
+
cmd << " #{File.expand_path bamfile}"
|
52
|
+
cmd << " --output-dir ."
|
53
|
+
cmd << " --output-align-samp"
|
54
|
+
cmd << " --no-update-check"
|
55
|
+
cmd << " --additional-online 1"
|
56
|
+
cmd
|
57
|
+
end
|
58
|
+
|
59
|
+
# return a hash of target => effective_count created
|
60
|
+
# by parsing the results file
|
61
|
+
def load_expression file
|
62
|
+
expression = {}
|
63
|
+
first = true
|
64
|
+
File.open(file).each do |line|
|
65
|
+
if first
|
66
|
+
first = false
|
67
|
+
next
|
68
|
+
end
|
69
|
+
line = line.chomp.split("\t")
|
70
|
+
target = line[1]
|
71
|
+
effective_count = line[7]
|
72
|
+
expression[target] = effective_count.to_f
|
73
|
+
end
|
74
|
+
expression
|
75
|
+
end
|
76
|
+
|
77
|
+
end # Express
|
78
|
+
|
79
|
+
end # Transrate
|
data/lib/transrate/read_metrics.rb
CHANGED
@@ -2,243 +2,254 @@ module Transrate
|
|
2
2
|
|
3
3
|
class ReadMetrics
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
attr_reader :total
|
5
|
+
attr_reader :fragments_mapping
|
6
|
+
attr_reader :p_good_mapping
|
9
7
|
attr_reader :bad
|
10
8
|
attr_reader :supported_bridges
|
11
|
-
attr_reader :pr_good_mapping
|
12
|
-
attr_reader :percent_mapping
|
13
|
-
attr_reader :prop_expressed
|
14
9
|
attr_reader :has_run
|
10
|
+
attr_reader :read_length
|
15
11
|
|
16
12
|
def initialize assembly
|
17
13
|
@assembly = assembly
|
18
|
-
@mapper = Bowtie2.new
|
14
|
+
@mapper = Snap.new
|
19
15
|
self.initial_values
|
16
|
+
|
17
|
+
load_executables
|
18
|
+
@read_length = 100
|
19
|
+
end
|
20
|
+
|
21
|
+
def load_executables
|
22
|
+
@bam_splitter = get_bin_path 'bam-split'
|
23
|
+
@bam_reader = get_bin_path 'bam-read'
|
24
|
+
end
|
25
|
+
|
26
|
+
def get_bin_path bin
|
27
|
+
which_bin = Cmd.new("which #{bin}")
|
28
|
+
which_bin.run
|
29
|
+
if !which_bin.status.success?
|
30
|
+
raise IOError.new("ReadMetrics: could not find #{bin} in path")
|
31
|
+
end
|
32
|
+
which_bin.stdout.split("\n").first
|
20
33
|
end
|
21
34
|
|
22
35
|
def run left, right, insertsize:200, insertsd:50, threads:8
|
36
|
+
# check all read files exist
|
23
37
|
[left, right].each do |readfile|
|
24
|
-
|
25
|
-
|
38
|
+
raise IOError.new "Read file is nil" if readfile.nil?
|
39
|
+
readfile.split(",").each do |file|
|
40
|
+
unless File.exist? file
|
41
|
+
raise IOError.new "ReadMetrics: read file does not exist: #{file}"
|
42
|
+
end
|
26
43
|
end
|
27
44
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
45
|
+
|
46
|
+
# estimate max read length
|
47
|
+
@read_length = get_read_length(left, right)
|
48
|
+
|
49
|
+
# map reads
|
50
|
+
@mapper.build_index(@assembly.file, threads)
|
51
|
+
bamfile = @mapper.map_reads(@assembly.file, left, right,
|
31
52
|
insertsize: insertsize,
|
32
53
|
insertsd: insertsd,
|
33
54
|
threads: threads)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
55
|
+
@fragments = @mapper.read_count
|
56
|
+
|
57
|
+
# classify bam file into valid and invalid alignments
|
58
|
+
sorted_bam = "#{File.basename(bamfile, '.bam')}.merged.sorted.bam"
|
59
|
+
readsorted_bam = "#{File.basename(bamfile, '.bam')}.valid.sorted.bam"
|
60
|
+
unless File.exist? sorted_bam
|
61
|
+
valid_bam, invalid_bam = split_bam bamfile
|
62
|
+
readsorted_bam = Samtools.readsort_bam(valid_bam, threads)
|
63
|
+
File.delete valid_bam
|
64
|
+
end
|
65
|
+
|
66
|
+
# pass valid alignments to eXpress for assignment
|
67
|
+
# always have to run the eXpress command to load the results
|
68
|
+
assigned_bam = assign_and_quantify readsorted_bam
|
69
|
+
|
70
|
+
# merge the assigned alignments back with the invalid ones
|
71
|
+
unless File.exist? sorted_bam
|
72
|
+
File.delete readsorted_bam
|
73
|
+
merged_bam = "#{File.basename(bamfile, '.bam')}.merged.bam"
|
74
|
+
Samtools.merge_bam(invalid_bam, assigned_bam, merged_bam, threads=threads)
|
75
|
+
File.delete invalid_bam
|
76
|
+
File.delete assigned_bam
|
77
|
+
sorted_bam = Samtools.sort_bam(merged_bam, threads)
|
78
|
+
File.delete merged_bam
|
79
|
+
end
|
80
|
+
|
81
|
+
# analyse the final mappings
|
82
|
+
analyse_read_mappings(sorted_bam, insertsize, insertsd, true)
|
83
|
+
|
40
84
|
@has_run = true
|
41
85
|
end
|
42
86
|
|
43
87
|
def read_stats
|
44
88
|
{
|
45
|
-
:
|
46
|
-
:
|
47
|
-
:
|
89
|
+
:fragments => @fragments,
|
90
|
+
:fragments_mapped => @fragments_mapped,
|
91
|
+
:p_fragments_mapped => @p_fragments_mapped,
|
48
92
|
:good_mappings => @good,
|
49
|
-
:
|
93
|
+
:p_good_mapping => @p_good_mapping,
|
50
94
|
:bad_mappings => @bad,
|
51
|
-
:potential_bridges => @
|
52
|
-
:
|
53
|
-
:
|
54
|
-
:
|
55
|
-
:
|
56
|
-
:
|
57
|
-
:
|
58
|
-
:
|
59
|
-
:
|
60
|
-
:
|
95
|
+
:potential_bridges => @potential_bridges,
|
96
|
+
:bases_uncovered => @bases_uncovered,
|
97
|
+
:p_bases_uncovered => @p_bases_uncovered,
|
98
|
+
:contigs_uncovbase => @contigs_uncovbase,
|
99
|
+
:p_contigs_uncovbase => @p_contigs_uncovbase,
|
100
|
+
:contigs_uncovered => @contigs_uncovered,
|
101
|
+
:p_contigs_uncovered => @p_contigs_uncovered,
|
102
|
+
:contigs_lowcovered => @contigs_lowcovered,
|
103
|
+
:p_contigs_lowcovered => @p_contigs_lowcovered,
|
104
|
+
:contigs_segmented => @contigs_segmented,
|
105
|
+
:p_contigs_segmented => @p_contigs_segmented,
|
106
|
+
:contigs_good => @contigs_good,
|
107
|
+
:p_contigs_good => @p_contigs_good
|
61
108
|
}
|
62
109
|
end
|
63
110
|
|
64
|
-
def
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
111
|
+
def get_read_length(left, right)
|
112
|
+
count=0
|
113
|
+
file = File.open(left.split(",").first)
|
114
|
+
name = file.readline.chomp
|
115
|
+
seq = file.readline.chomp
|
116
|
+
na = file.readline.chomp
|
117
|
+
qual = file.readline.chomp
|
118
|
+
read_length = 0
|
119
|
+
while name and count < 5000 # get max read length from first 5000 reads
|
120
|
+
read_length = [read_length, seq.length].max
|
121
|
+
name = file.readline.chomp rescue nil
|
122
|
+
seq = file.readline.chomp rescue nil
|
123
|
+
na = file.readline.chomp rescue nil
|
124
|
+
qual = file.readline.chomp rescue nil
|
125
|
+
count+=1
|
126
|
+
end
|
127
|
+
read_length
|
128
|
+
end
|
129
|
+
|
130
|
+
def split_bam bamfile
|
131
|
+
base = File.basename(bamfile, '.bam')
|
132
|
+
valid = "#{base}.valid.bam"
|
133
|
+
invalid = "#{base}.invalid.bam"
|
134
|
+
if !File.exist? valid
|
135
|
+
cmd = "#{@bam_splitter} #{bamfile}"
|
136
|
+
splitter = Cmd.new cmd
|
137
|
+
splitter.run
|
138
|
+
if !splitter.status.success?
|
139
|
+
logger.warn "Couldn't split bam file: #{bamfile}" +
|
140
|
+
"\n#{splitter.stdout}\n#{splitter.stderr}"
|
88
141
|
end
|
89
|
-
check_bridges
|
90
|
-
else
|
91
|
-
raise "samfile #{samfile} not found"
|
92
142
|
end
|
143
|
+
if !File.exist? valid
|
144
|
+
logger.warn "Splitting failed to create valid bam: #{valid}"
|
145
|
+
end
|
146
|
+
[valid, invalid]
|
93
147
|
end
|
94
148
|
|
95
|
-
def
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
@both_mapped = 0
|
101
|
-
@properly_paired = 0
|
102
|
-
@improperly_paired = 0
|
103
|
-
@proper_orientation = 0
|
104
|
-
@improper_orientation = 0
|
105
|
-
@same_contig = 0
|
106
|
-
@realistic_overlap = 0
|
107
|
-
@unrealistic_overlap = 0
|
108
|
-
@realistic_fragment = 0
|
109
|
-
@unrealistic_fragment = 0
|
110
|
-
@n_uncovered_bases = 0
|
111
|
-
@n_uncovered_base_contigs = 0 # any base cov < 1
|
112
|
-
@n_uncovered_contigs = 0 # mean cov < 1
|
113
|
-
@n_lowcovered_contigs = 0 # mean cov < 10
|
149
|
+
def assign_and_quantify bamfile
|
150
|
+
express = Express.new
|
151
|
+
results = express.run(@assembly, bamfile)
|
152
|
+
analyse_expression results.expression
|
153
|
+
results.align_samp
|
114
154
|
end
|
115
155
|
|
116
|
-
def
|
117
|
-
|
156
|
+
def analyse_expression express_output
|
157
|
+
express_output.each_pair do |name, eff_count|
|
158
|
+
@contigs_uncovered += 1 if eff_count < 1
|
159
|
+
@contigs_lowcovered += 1 if eff_count < 10
|
160
|
+
contig = @assembly[name]
|
161
|
+
contig.coverage = eff_count
|
162
|
+
end
|
118
163
|
end
|
119
164
|
|
120
|
-
def
|
165
|
+
def analyse_read_mappings bamfile, insertsize, insertsd, bridge=true
|
166
|
+
if File.exist?(bamfile) && File.size(bamfile) > 0
|
167
|
+
csv_output = "#{File.basename(@assembly.file)}_bam_info.csv"
|
168
|
+
csv_output = File.expand_path(csv_output)
|
121
169
|
|
122
|
-
|
170
|
+
analyse_bam bamfile, csv_output
|
171
|
+
# open output csv file
|
172
|
+
@potential_bridges = 0
|
123
173
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
# reads are paired
|
129
|
-
@both_mapped += 1 if ls.primary_aln?
|
130
|
-
if ls.read_properly_paired?
|
131
|
-
# mapped in proper pair
|
132
|
-
@properly_paired += 1
|
133
|
-
self.check_orientation(ls, rs)
|
134
|
-
else
|
135
|
-
# not mapped in proper pair
|
136
|
-
@improperly_paired += 1
|
137
|
-
if ls.chrom == rs.chrom
|
138
|
-
# both on same contig
|
139
|
-
@same_contig += 1
|
140
|
-
self.check_overlap_plausibility(ls, rs)
|
141
|
-
else
|
142
|
-
self.check_fragment_plausibility(ls, rs, realistic_dist)
|
143
|
-
end
|
174
|
+
CSV.foreach(csv_output, :headers => true,
|
175
|
+
:header_converters => :symbol,
|
176
|
+
:converters => :all) do |row|
|
177
|
+
populate_contig_data row
|
144
178
|
end
|
145
|
-
|
146
|
-
end
|
147
|
-
|
148
|
-
def check_orientation ls, rs
|
149
|
-
if ls.pair_opposite_strands?
|
150
|
-
# mates in proper orientation
|
151
|
-
@proper_orientation += 1
|
152
|
-
@good += 1
|
179
|
+
@bad = @fragments_mapped - @good
|
153
180
|
else
|
154
|
-
|
155
|
-
|
156
|
-
|
181
|
+
logger.warn "couldn't find bamfile: #{bamfile}"
|
182
|
+
end
|
183
|
+
@assembly.assembly.each_pair do |name, contig|
|
184
|
+
@contigs_good += 1 if contig.score >= 0.5
|
157
185
|
end
|
186
|
+
update_proportions
|
158
187
|
end
|
159
188
|
|
160
|
-
def
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
189
|
+
def update_proportions
|
190
|
+
nbases = @assembly.n_bases.to_f
|
191
|
+
ncontigs = @assembly.size.to_f
|
192
|
+
|
193
|
+
@p_bases_uncovered = @bases_uncovered / nbases
|
194
|
+
@p_contigs_uncovbase = @contigs_uncovbase / ncontigs
|
195
|
+
@p_contigs_uncovered = @contigs_uncovered / ncontigs
|
196
|
+
@p_contigs_lowcovered = @contigs_lowcovered / ncontigs
|
197
|
+
@p_contigs_segmented = @contigs_segmented / ncontigs
|
198
|
+
@p_contigs_good = @contigs_good / ncontigs
|
199
|
+
|
200
|
+
@p_good_mapping = @good.to_f / @fragments.to_f
|
201
|
+
@p_fragments_mapped = @fragments_mapped / @fragments.to_f
|
170
202
|
end
|
171
203
|
|
172
|
-
def
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
# increase the evidence for this bridge
|
180
|
-
key = [ls.chrom, rs.chrom].sort.join("<>").to_sym
|
181
|
-
if @bridges.has_key? key
|
182
|
-
@bridges[key] += 1
|
183
|
-
else
|
184
|
-
@bridges[key] = 1
|
204
|
+
def analyse_bam bamfile, csv_output
|
205
|
+
if !File.exist?(csv_output)
|
206
|
+
cmd = "#{@bam_reader} #{bamfile} #{csv_output}"
|
207
|
+
reader = Cmd.new cmd
|
208
|
+
reader.run
|
209
|
+
if !reader.status.success?
|
210
|
+
logger.warn "couldn't get information from bam file: #{bamfile}"
|
185
211
|
end
|
186
|
-
@realistic_fragment += 1
|
187
|
-
@good += 1
|
188
|
-
else
|
189
|
-
@unrealistic_fragment += 1
|
190
|
-
@bad += 1
|
191
212
|
end
|
192
213
|
end
|
193
214
|
|
194
|
-
def
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
215
|
+
def populate_contig_data row
|
216
|
+
contig = @assembly[row[:name]]
|
217
|
+
scale = 0.7
|
218
|
+
contig.p_seq_true = (row[:p_seq_true] - scale) * (1.0 / (1 - scale))
|
219
|
+
contig.uncovered_bases = row[:bases_uncovered]
|
220
|
+
@bases_uncovered += contig.uncovered_bases
|
221
|
+
if row[:fragments_mapped] and row[:fragments_mapped] > 0
|
222
|
+
contig.p_good = row[:good]/row[:fragments_mapped].to_f
|
223
|
+
end
|
224
|
+
contig.p_not_segmented = row[:p_not_segmented]
|
225
|
+
if contig.p_not_segmented < 0.5
|
226
|
+
@contigs_segmented += 1
|
227
|
+
end
|
228
|
+
contig.in_bridges = row[:bridges]
|
229
|
+
contig.p_unique = row[:p_unique]
|
230
|
+
if row[:bridges] > 1
|
231
|
+
@potential_bridges += 1
|
232
|
+
end
|
233
|
+
@fragments_mapped += row[:fragments_mapped]
|
234
|
+
@good += row[:good]
|
235
|
+
if row[:bases_uncovered] > 0
|
236
|
+
@contigs_uncovbase += 1
|
206
237
|
end
|
207
238
|
end
|
208
239
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
#
|
216
|
-
#
|
217
|
-
|
218
|
-
@
|
219
|
-
|
220
|
-
contig.uncovered_bases, total = 0, 0
|
221
|
-
coverage.each do |e|
|
222
|
-
total += e
|
223
|
-
contig.uncovered_bases += 1 if e < 1
|
224
|
-
end
|
225
|
-
tot_length += coverage.length
|
226
|
-
tot_coverage += total
|
227
|
-
contig.mean_coverage = total / coverage.length.to_f
|
228
|
-
@n_uncovered_bases += contig.uncovered_bases
|
229
|
-
@n_uncovered_base_contigs += 1 if contig.uncovered_bases > 0
|
230
|
-
@n_uncovered_contigs += 1 if contig.mean_coverage < 1
|
231
|
-
@n_lowcovered_contigs += 1 if contig.mean_coverage < 10
|
232
|
-
end
|
233
|
-
@mean_coverage = (tot_coverage / tot_length.to_f).round(2)
|
234
|
-
@p_uncovered_bases = @n_uncovered_bases / @assembly.n_bases.to_f
|
235
|
-
@p_uncovered_base_contigs = @n_uncovered_base_contigs /
|
236
|
-
@assembly.size.to_f
|
237
|
-
@p_uncovered_contigs = @n_uncovered_contigs / @assembly.size.to_f
|
238
|
-
@p_lowcovered_contigs = @n_lowcovered_contigs / @assembly.size.to_f
|
240
|
+
def initial_values
|
241
|
+
@fragments = 0
|
242
|
+
@fragments_mapped = 0
|
243
|
+
@good = 0
|
244
|
+
@bad = 0
|
245
|
+
@bases_uncovered = 0
|
246
|
+
@contigs_uncovbase = 0 # any base cov < 1
|
247
|
+
@contigs_uncovered = 0 # mean cov < 1
|
248
|
+
@contigs_lowcovered = 0 # mean cov < 10
|
249
|
+
@contigs_segmented = 0 # p_not_segmented < 0.5
|
250
|
+
@contigs_good = 0
|
239
251
|
end
|
240
252
|
|
241
253
|
end # ReadMetrics
|
242
254
|
|
243
255
|
end # Transrate
|
244
|
-
|