transrate 0.3.1 → 1.0.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +7 -0
- data/README.md +7 -6
- data/bin/transrate +21 -9
- data/deps/deps.yaml +49 -14
- data/ext/transrate/transrate.c +200 -166
- data/lib/transrate.rb +2 -3
- data/lib/transrate/assembly.rb +0 -49
- data/lib/transrate/cmd.rb +4 -0
- data/lib/transrate/comparative_metrics.rb +16 -64
- data/lib/transrate/contig.rb +57 -18
- data/lib/transrate/express.rb +79 -0
- data/lib/transrate/read_metrics.rb +196 -185
- data/lib/transrate/samtools.rb +88 -16
- data/lib/transrate/snap.rb +123 -0
- data/lib/transrate/transrater.rb +16 -19
- data/lib/transrate/version.rb +4 -4
- data/test/data/bridging_reads.l.fastq +20 -0
- data/test/data/bridging_reads.r.fastq +20 -0
- data/test/test_bin.rb +50 -21
- data/test/test_comp_metrics.rb +3 -27
- data/test/test_contig.rb +8 -0
- data/test/test_inline.rb +1 -1
- data/test/test_read_metrics.rb +108 -19
- data/test/test_transrater.rb +5 -5
- data/transrate.gemspec +2 -5
- metadata +66 -129
- data/lib/transrate/bowtie2.rb +0 -75
- data/lib/transrate/dimension_reduce.rb +0 -18
- data/lib/transrate/metric.rb +0 -16
- data/test/test_bowtie.rb +0 -66
data/lib/transrate.rb
CHANGED
@@ -7,12 +7,11 @@ require 'transrate/transrater'
|
|
7
7
|
require 'transrate/version'
|
8
8
|
require 'transrate/contig'
|
9
9
|
require 'transrate/assembly'
|
10
|
-
require 'transrate/bowtie2'
|
10
|
+
require 'transrate/snap'
|
11
|
+
require 'transrate/express'
|
11
12
|
require 'transrate/read_metrics'
|
12
13
|
require 'transrate/comparative_metrics'
|
13
14
|
require 'transrate/contig_metrics'
|
14
|
-
require 'transrate/metric'
|
15
|
-
require 'transrate/dimension_reduce'
|
16
15
|
require 'transrate/samtools'
|
17
16
|
require 'transrate/cmd'
|
18
17
|
require 'transrate/transrate.so'
|
data/lib/transrate/assembly.rb
CHANGED
@@ -184,55 +184,6 @@ module Transrate
|
|
184
184
|
|
185
185
|
end # basic_bin_stats
|
186
186
|
|
187
|
-
# Calls *block* with two arguments, the contig and an array
|
188
|
-
# of integer per-base coverage counts.
|
189
|
-
#
|
190
|
-
# @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
|
191
|
-
# @param block [Block] the block to call
|
192
|
-
def each_with_coverage(bam, &block)
|
193
|
-
logger.debug 'enumerating assembly with coverage'
|
194
|
-
# generate coverage with samtools
|
195
|
-
covfile = Samtools.coverage bam
|
196
|
-
# get an assembly enumerator
|
197
|
-
assembly_enum = @assembly.to_enum
|
198
|
-
contig_name, contig = assembly_enum.next
|
199
|
-
# precreate an array of the correct size to contain
|
200
|
-
# coverage. this is necessary because samtools mpileup
|
201
|
-
# doesn't print a result line for bases with 0 coverage
|
202
|
-
contig.coverage = Array.new(contig.length, 0)
|
203
|
-
# the columns we need
|
204
|
-
name_i, pos_i, cov_i = 0, 1, 3
|
205
|
-
# parse the coverage file
|
206
|
-
File.open(covfile).each_line do |line|
|
207
|
-
cols = line.chomp.split("\t")
|
208
|
-
unless (cols && cols.length > 4)
|
209
|
-
# last line
|
210
|
-
break
|
211
|
-
end
|
212
|
-
# extract the columns
|
213
|
-
name = Bio::FastaDefline.new(cols[name_i]).entry_id
|
214
|
-
pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
|
215
|
-
unless contig_name == name
|
216
|
-
while contig_name != name
|
217
|
-
begin
|
218
|
-
block.call(contig, contig.coverage)
|
219
|
-
contig_name, contig = assembly_enum.next
|
220
|
-
contig.coverage = Array.new(contig.length, 0)
|
221
|
-
rescue StopIteration => stop_error
|
222
|
-
logger.error 'reached the end of assembly enumerator while ' +
|
223
|
-
'there were contigs left in the coverage results'
|
224
|
-
logger.error "final assembly contig: #{@assembly.last.name}"
|
225
|
-
logger.error "coverage contig: #{name}"
|
226
|
-
raise stop_error
|
227
|
-
end
|
228
|
-
end
|
229
|
-
end
|
230
|
-
contig.coverage[pos - 1] = cov
|
231
|
-
end
|
232
|
-
# yield the final contig
|
233
|
-
block.call(contig, contig.coverage)
|
234
|
-
end
|
235
|
-
|
236
187
|
end # Assembly
|
237
188
|
|
238
189
|
end # Transrate
|
data/lib/transrate/cmd.rb
CHANGED
@@ -11,7 +11,6 @@ module Transrate
|
|
11
11
|
attr_reader :has_run
|
12
12
|
attr_reader :reference_coverage
|
13
13
|
attr_reader :comp_stats
|
14
|
-
attr_reader :n_chimeras, :p_chimeras
|
15
14
|
|
16
15
|
def initialize assembly, reference, threads
|
17
16
|
@assembly = assembly
|
@@ -23,14 +22,12 @@ module Transrate
|
|
23
22
|
def run
|
24
23
|
@crbblast = reciprocal_best_blast
|
25
24
|
@reference_coverage = coverage @crbblast
|
26
|
-
@collapse_factor = collapse_factor @crbblast.reciprocals
|
27
25
|
@reciprocal_hits = @crbblast.size
|
28
26
|
@rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
|
29
27
|
@p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
|
30
28
|
@n_contigs_with_recip = @crbblast.reciprocals.size
|
31
29
|
count_ref_crbbs
|
32
30
|
@p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
|
33
|
-
chimeras @crbblast
|
34
31
|
self.run_comp_stats
|
35
32
|
@has_run = true
|
36
33
|
end
|
@@ -43,9 +40,6 @@ module Transrate
|
|
43
40
|
@comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
|
44
41
|
@comp_stats[:rbh_per_reference] = @rbh_per_reference
|
45
42
|
@comp_stats[:reference_coverage] = @reference_coverage
|
46
|
-
@comp_stats[:collapse_factor] = @collapse_factor
|
47
|
-
@comp_stats[:n_chimeras] = @n_chimeras
|
48
|
-
@comp_stats[:p_chimeras] = @p_chimeras
|
49
43
|
end
|
50
44
|
|
51
45
|
def reciprocal_best_blast
|
@@ -71,19 +65,29 @@ module Transrate
|
|
71
65
|
contig = @assembly[hit.query]
|
72
66
|
contig.has_crb = true
|
73
67
|
# how much of the reference is covered by this single contig
|
74
|
-
|
68
|
+
if crbblast.target_is_prot
|
69
|
+
contig.reference_coverage =
|
70
|
+
(hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
|
71
|
+
else
|
72
|
+
contig.reference_coverage =
|
73
|
+
(hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
|
74
|
+
end
|
75
75
|
contig.hits << hit
|
76
76
|
end
|
77
77
|
end
|
78
78
|
total_coverage = 0
|
79
79
|
total_length = 0
|
80
80
|
cov = [0.25, 0.5, 0.75, 0.85, 0.95]
|
81
|
+
@cov ||= [0, 0, 0, 0, 0]
|
81
82
|
@reference.each_value do |ref_contig|
|
82
83
|
key = ref_contig.name
|
83
84
|
list = ref_contig.hits
|
84
|
-
|
85
|
-
|
86
|
-
|
85
|
+
if crbblast.target_is_prot
|
86
|
+
total_length += ref_contig.length * 3
|
87
|
+
else
|
88
|
+
total_length += ref_contig.length
|
89
|
+
end
|
90
|
+
next if list.empty?
|
87
91
|
blocks = []
|
88
92
|
target_length = 0
|
89
93
|
list.each do |hit|
|
@@ -162,9 +166,7 @@ module Transrate
|
|
162
166
|
end # each_with_index a
|
163
167
|
# sum blocks to find total coverage
|
164
168
|
length_of_coverage = calculate_coverage blocks
|
165
|
-
@cov ||= [0, 0, 0, 0, 0]
|
166
169
|
if target_length > 0
|
167
|
-
# puts "#{length_of_coverage} / #{target_length.to_f}"
|
168
170
|
ref_p = length_of_coverage / target_length.to_f
|
169
171
|
else
|
170
172
|
ref_p = 0
|
@@ -179,10 +181,11 @@ module Transrate
|
|
179
181
|
|
180
182
|
total_coverage += length_of_coverage
|
181
183
|
end
|
184
|
+
|
182
185
|
cov.each_with_index do |p, i|
|
183
186
|
@comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
|
184
187
|
@comp_stats["p_cov#{(100*p).to_i}".to_sym] =
|
185
|
-
|
188
|
+
@cov[i]/@reference.size.to_f
|
186
189
|
end
|
187
190
|
total_coverage / total_length.to_f
|
188
191
|
end
|
@@ -210,44 +213,6 @@ module Transrate
|
|
210
213
|
end
|
211
214
|
end
|
212
215
|
|
213
|
-
def chimeras crbblast
|
214
|
-
@n_chimeras = 0
|
215
|
-
crbblast.reciprocals.each_pair do |key, list|
|
216
|
-
p = 0
|
217
|
-
list.each_with_index do |a, i|
|
218
|
-
list.each_with_index do |b, j|
|
219
|
-
if j>i
|
220
|
-
if a.target == b.target
|
221
|
-
astart, astop = [a.tstart, a.tend].minmax
|
222
|
-
bstart, bstop = [b.tstart, b.tend].minmax
|
223
|
-
|
224
|
-
oa = overlap_amount(astart, astop, bstart, bstop)
|
225
|
-
if oa > 0.75
|
226
|
-
p += 1
|
227
|
-
end
|
228
|
-
else
|
229
|
-
astart, astop = [a.qstart, a.qend].minmax
|
230
|
-
bstart, bstop = [b.qstart, b.qend].minmax
|
231
|
-
|
232
|
-
oa = overlap_amount(astart, astop, bstart, bstop)
|
233
|
-
if oa < 0.25
|
234
|
-
p += 1
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
238
|
-
end
|
239
|
-
end
|
240
|
-
if p/list.size.to_f >= 0.5
|
241
|
-
@n_chimeras += 1
|
242
|
-
unless @assembly.assembly.key? key
|
243
|
-
puts "key not in assembly: #{key}"
|
244
|
-
end
|
245
|
-
@assembly[key].is_chimera = true
|
246
|
-
end
|
247
|
-
end
|
248
|
-
@p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
|
249
|
-
end
|
250
|
-
|
251
216
|
def overlap(astart, astop, bstart, bstop)
|
252
217
|
if astart == bstart and astop == bstop
|
253
218
|
return 0
|
@@ -300,19 +265,6 @@ module Transrate
|
|
300
265
|
end
|
301
266
|
end
|
302
267
|
|
303
|
-
# Count unique reference proteins per contig
|
304
|
-
def collapse_factor reciprocals
|
305
|
-
return @collapse_factor unless @collapse_factor.nil?
|
306
|
-
cf_sum = 0
|
307
|
-
reciprocals.each do |query, hits|
|
308
|
-
uniq_hits = Set.new hits.map{ |h| h.target }
|
309
|
-
cf = uniq_hits.length
|
310
|
-
@assembly[query].collapse_factor = cf
|
311
|
-
cf_sum += cf
|
312
|
-
end
|
313
|
-
cf_sum / reciprocals.size
|
314
|
-
end
|
315
|
-
|
316
268
|
end # ComparativeMetrics
|
317
269
|
|
318
270
|
end # Transrate
|
data/lib/transrate/contig.rb
CHANGED
@@ -10,9 +10,12 @@ module Transrate
|
|
10
10
|
def_delegators :@seq, :size, :length
|
11
11
|
attr_accessor :seq, :name
|
12
12
|
# read-based metrics
|
13
|
-
attr_accessor :coverage, :uncovered_bases, :
|
13
|
+
attr_accessor :coverage, :uncovered_bases, :p_uncovered_bases
|
14
|
+
attr_accessor :p_seq_true, :p_unique
|
15
|
+
attr_accessor :low_uniqueness_bases, :in_bridges
|
16
|
+
attr_accessor :p_good, :p_not_segmented
|
14
17
|
# reference-based metrics
|
15
|
-
attr_accessor :has_crb, :
|
18
|
+
attr_accessor :has_crb, :reference_coverage
|
16
19
|
attr_accessor :hits
|
17
20
|
|
18
21
|
def initialize(seq, name: nil)
|
@@ -22,11 +25,16 @@ module Transrate
|
|
22
25
|
@name = seq.respond_to?(:entry_id) ? seq.entry_id : name
|
23
26
|
@hits = []
|
24
27
|
@reference_coverage = 0
|
25
|
-
@collapse_factor = 0
|
26
|
-
@is_chimera = false
|
27
28
|
@has_crb = false
|
28
29
|
@in_bridges = 0
|
29
|
-
@
|
30
|
+
@p_seq_true = 0
|
31
|
+
@low_uniqueness_bases = 0
|
32
|
+
@p_good = -1
|
33
|
+
@uncovered_bases = length
|
34
|
+
@p_uncovered_bases = 1
|
35
|
+
@p_unique = 0
|
36
|
+
@p_not_segmented = 1
|
37
|
+
@score = -1
|
30
38
|
end
|
31
39
|
|
32
40
|
def each &block
|
@@ -43,34 +51,38 @@ module Transrate
|
|
43
51
|
:cpg_count => cpg_count,
|
44
52
|
:cpg_ratio => cpg_ratio,
|
45
53
|
:orf_length => orf_length,
|
46
|
-
:linguistic_complexity_6 => linguistic_complexity(6)
|
54
|
+
:linguistic_complexity_6 => linguistic_complexity(6),
|
47
55
|
}
|
48
56
|
end
|
49
57
|
|
50
58
|
def read_metrics
|
51
|
-
read = @
|
52
|
-
:
|
53
|
-
:
|
54
|
-
:
|
59
|
+
read = @p_good>=0 ? {
|
60
|
+
:in_bridges => in_bridges,
|
61
|
+
:p_good => @p_good,
|
62
|
+
:p_bases_covered => p_bases_covered,
|
63
|
+
:p_seq_true => p_seq_true,
|
64
|
+
:score => score,
|
65
|
+
:p_unique => p_unique,
|
66
|
+
:p_not_segmented => p_not_segmented
|
55
67
|
} : {
|
56
|
-
:
|
57
|
-
:
|
58
|
-
:
|
68
|
+
:in_bridges => "NA",
|
69
|
+
:p_good => "NA",
|
70
|
+
:p_bases_covered => "NA",
|
71
|
+
:p_seq_true => "NA",
|
72
|
+
:score => "NA",
|
73
|
+
:p_unique => p_unique,
|
74
|
+
:p_not_segmented => p_not_segmented
|
59
75
|
}
|
60
76
|
end
|
61
77
|
|
62
78
|
def comparative_metrics
|
63
79
|
reference = @has_crb ? {
|
64
80
|
:has_crb => has_crb,
|
65
|
-
:collapse_factor => collapse_factor,
|
66
81
|
:reference_coverage => reference_coverage,
|
67
|
-
:is_chimera => is_chimera,
|
68
82
|
:hits => hits.map{ |h| h.target }.join(";")
|
69
83
|
} : {
|
70
84
|
:has_crb => false,
|
71
|
-
:collapse_factor => "NA",
|
72
85
|
:reference_coverage => "NA",
|
73
|
-
:is_chimera => "NA",
|
74
86
|
:hits => "NA"
|
75
87
|
}
|
76
88
|
end
|
@@ -89,7 +101,7 @@ module Transrate
|
|
89
101
|
composition(@seq.seq)
|
90
102
|
alphabet = ['a', 'c', 'g', 't', 'n']
|
91
103
|
@base_composition = {}
|
92
|
-
@dibase_composition={}
|
104
|
+
@dibase_composition = {}
|
93
105
|
bases = []
|
94
106
|
dibases = []
|
95
107
|
alphabet.each do |c|
|
@@ -208,6 +220,33 @@ module Transrate
|
|
208
220
|
def linguistic_complexity k
|
209
221
|
return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
|
210
222
|
end
|
223
|
+
|
224
|
+
def p_bases_covered
|
225
|
+
1 - p_uncovered_bases
|
226
|
+
end
|
227
|
+
|
228
|
+
def uncovered_bases= n
|
229
|
+
@uncovered_bases = n
|
230
|
+
@p_uncovered_bases = n / length.to_f
|
231
|
+
end
|
232
|
+
|
233
|
+
def p_unique_bases
|
234
|
+
(length - low_uniqueness_bases) / length.to_f
|
235
|
+
end
|
236
|
+
|
237
|
+
# Contig score (geometric mean of all score components)
|
238
|
+
def score
|
239
|
+
return @score if @score != -1
|
240
|
+
prod =
|
241
|
+
[p_bases_covered, 0.01].max * # proportion of bases covered
|
242
|
+
[p_not_segmented, 0.01].max * # prob contig has 0 changepoints
|
243
|
+
[p_good, 0.01].max * # proportion of reads that mapped good
|
244
|
+
[p_seq_true, 0.01].max * # scaled 1 - mean per-base edit distance
|
245
|
+
[p_unique, 0.01].max # prop mapQ >= 5
|
246
|
+
s = prod ** (1.0 / 5)
|
247
|
+
s = 0.01 if !s
|
248
|
+
@score = [s, 0.01].max
|
249
|
+
end
|
211
250
|
end
|
212
251
|
|
213
252
|
end
|
data/lib/transrate/express.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
module Transrate
|
3
|
+
|
4
|
+
class ExpressError < StandardError
|
5
|
+
end
|
6
|
+
|
7
|
+
class Express
|
8
|
+
|
9
|
+
require 'ostruct'
|
10
|
+
|
11
|
+
# return an Express object
|
12
|
+
def initialize
|
13
|
+
which = Cmd.new('which express')
|
14
|
+
which.run
|
15
|
+
if !which.status.success?
|
16
|
+
raise ExpressError.new("could not find express in the path")
|
17
|
+
end
|
18
|
+
@express = which.stdout.split("\n").first
|
19
|
+
end
|
20
|
+
|
21
|
+
# return struct containing:
|
22
|
+
# results_file => path to the express results TSV
|
23
|
+
# expression => a hash of target => effective_count
|
24
|
+
# align_samp => path to the sampled alignments file
|
25
|
+
def run assembly, bamfile
|
26
|
+
assembly = assembly.file if assembly.is_a? Assembly
|
27
|
+
|
28
|
+
ex_output = 'results.xprs'
|
29
|
+
fin_output = "#{File.basename assembly}_#{ex_output}"
|
30
|
+
|
31
|
+
unless File.exists? fin_output
|
32
|
+
runner = Cmd.new build_command(assembly, bamfile)
|
33
|
+
runner.run
|
34
|
+
unless runner.status.success?
|
35
|
+
raise ExpressError.new("Express failed\n" +
|
36
|
+
runner.stderr + "\n" +
|
37
|
+
runner.stdout)
|
38
|
+
end
|
39
|
+
File.rename(ex_output, fin_output)
|
40
|
+
end
|
41
|
+
|
42
|
+
OpenStruct.new(:results_file => fin_output,
|
43
|
+
:expression => load_expression(fin_output),
|
44
|
+
:align_samp => 'hits.1.samp.bam')
|
45
|
+
end
|
46
|
+
|
47
|
+
# return the constructed eXpress command
|
48
|
+
def build_command assembly, bamfile
|
49
|
+
cmd = "#{@express}"
|
50
|
+
cmd << " #{File.expand_path assembly}"
|
51
|
+
cmd << " #{File.expand_path bamfile}"
|
52
|
+
cmd << " --output-dir ."
|
53
|
+
cmd << " --output-align-samp"
|
54
|
+
cmd << " --no-update-check"
|
55
|
+
cmd << " --additional-online 1"
|
56
|
+
cmd
|
57
|
+
end
|
58
|
+
|
59
|
+
# return a hash of target => effective_count created
|
60
|
+
# by parsing the results file
|
61
|
+
def load_expression file
|
62
|
+
expression = {}
|
63
|
+
first = true
|
64
|
+
File.open(file).each do |line|
|
65
|
+
if first
|
66
|
+
first = false
|
67
|
+
next
|
68
|
+
end
|
69
|
+
line = line.chomp.split("\t")
|
70
|
+
target = line[1]
|
71
|
+
effective_count = line[7]
|
72
|
+
expression[target] = effective_count.to_f
|
73
|
+
end
|
74
|
+
expression
|
75
|
+
end
|
76
|
+
|
77
|
+
end # Express
|
78
|
+
|
79
|
+
end # Transrate
|
data/lib/transrate/read_metrics.rb
CHANGED
@@ -2,243 +2,254 @@ module Transrate
|
|
2
2
|
|
3
3
|
class ReadMetrics
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
attr_reader :total
|
5
|
+
attr_reader :fragments_mapping
|
6
|
+
attr_reader :p_good_mapping
|
9
7
|
attr_reader :bad
|
10
8
|
attr_reader :supported_bridges
|
11
|
-
attr_reader :pr_good_mapping
|
12
|
-
attr_reader :percent_mapping
|
13
|
-
attr_reader :prop_expressed
|
14
9
|
attr_reader :has_run
|
10
|
+
attr_reader :read_length
|
15
11
|
|
16
12
|
def initialize assembly
|
17
13
|
@assembly = assembly
|
18
|
-
@mapper = Bowtie2.new
|
14
|
+
@mapper = Snap.new
|
19
15
|
self.initial_values
|
16
|
+
|
17
|
+
load_executables
|
18
|
+
@read_length = 100
|
19
|
+
end
|
20
|
+
|
21
|
+
def load_executables
|
22
|
+
@bam_splitter = get_bin_path 'bam-split'
|
23
|
+
@bam_reader = get_bin_path 'bam-read'
|
24
|
+
end
|
25
|
+
|
26
|
+
def get_bin_path bin
|
27
|
+
which_bin = Cmd.new("which #{bin}")
|
28
|
+
which_bin.run
|
29
|
+
if !which_bin.status.success?
|
30
|
+
raise IOError.new("ReadMetrics: could not find #{bin} in path")
|
31
|
+
end
|
32
|
+
which_bin.stdout.split("\n").first
|
20
33
|
end
|
21
34
|
|
22
35
|
def run left, right, insertsize:200, insertsd:50, threads:8
|
36
|
+
# check all read files exist
|
23
37
|
[left, right].each do |readfile|
|
24
|
-
|
25
|
-
|
38
|
+
raise IOError.new "Read file is nil" if readfile.nil?
|
39
|
+
readfile.split(",").each do |file|
|
40
|
+
unless File.exist? file
|
41
|
+
raise IOError.new "ReadMetrics: read file does not exist: #{file}"
|
42
|
+
end
|
26
43
|
end
|
27
44
|
end
|
28
|
-
|
29
|
-
|
30
|
-
|
45
|
+
|
46
|
+
# estimate max read length
|
47
|
+
@read_length = get_read_length(left, right)
|
48
|
+
|
49
|
+
# map reads
|
50
|
+
@mapper.build_index(@assembly.file, threads)
|
51
|
+
bamfile = @mapper.map_reads(@assembly.file, left, right,
|
31
52
|
insertsize: insertsize,
|
32
53
|
insertsd: insertsd,
|
33
54
|
threads: threads)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
55
|
+
@fragments = @mapper.read_count
|
56
|
+
|
57
|
+
# classify bam file into valid and invalid alignments
|
58
|
+
sorted_bam = "#{File.basename(bamfile, '.bam')}.merged.sorted.bam"
|
59
|
+
readsorted_bam = "#{File.basename(bamfile, '.bam')}.valid.sorted.bam"
|
60
|
+
unless File.exist? sorted_bam
|
61
|
+
valid_bam, invalid_bam = split_bam bamfile
|
62
|
+
readsorted_bam = Samtools.readsort_bam(valid_bam, threads)
|
63
|
+
File.delete valid_bam
|
64
|
+
end
|
65
|
+
|
66
|
+
# pass valid alignments to eXpress for assignment
|
67
|
+
# always have to run the eXpress command to load the results
|
68
|
+
assigned_bam = assign_and_quantify readsorted_bam
|
69
|
+
|
70
|
+
# merge the assigned alignments back with the invalid ones
|
71
|
+
unless File.exist? sorted_bam
|
72
|
+
File.delete readsorted_bam
|
73
|
+
merged_bam = "#{File.basename(bamfile, '.bam')}.merged.bam"
|
74
|
+
Samtools.merge_bam(invalid_bam, assigned_bam, merged_bam, threads=threads)
|
75
|
+
File.delete invalid_bam
|
76
|
+
File.delete assigned_bam
|
77
|
+
sorted_bam = Samtools.sort_bam(merged_bam, threads)
|
78
|
+
File.delete merged_bam
|
79
|
+
end
|
80
|
+
|
81
|
+
# analyse the final mappings
|
82
|
+
analyse_read_mappings(sorted_bam, insertsize, insertsd, true)
|
83
|
+
|
40
84
|
@has_run = true
|
41
85
|
end
|
42
86
|
|
43
87
|
def read_stats
|
44
88
|
{
|
45
|
-
:
|
46
|
-
:
|
47
|
-
:
|
89
|
+
:fragments => @fragments,
|
90
|
+
:fragments_mapped => @fragments_mapped,
|
91
|
+
:p_fragments_mapped => @p_fragments_mapped,
|
48
92
|
:good_mappings => @good,
|
49
|
-
:
|
93
|
+
:p_good_mapping => @p_good_mapping,
|
50
94
|
:bad_mappings => @bad,
|
51
|
-
:potential_bridges => @
|
52
|
-
:
|
53
|
-
:
|
54
|
-
:
|
55
|
-
:
|
56
|
-
:
|
57
|
-
:
|
58
|
-
:
|
59
|
-
:
|
60
|
-
:
|
95
|
+
:potential_bridges => @potential_bridges,
|
96
|
+
:bases_uncovered => @bases_uncovered,
|
97
|
+
:p_bases_uncovered => @p_bases_uncovered,
|
98
|
+
:contigs_uncovbase => @contigs_uncovbase,
|
99
|
+
:p_contigs_uncovbase => @p_contigs_uncovbase,
|
100
|
+
:contigs_uncovered => @contigs_uncovered,
|
101
|
+
:p_contigs_uncovered => @p_contigs_uncovered,
|
102
|
+
:contigs_lowcovered => @contigs_lowcovered,
|
103
|
+
:p_contigs_lowcovered => @p_contigs_lowcovered,
|
104
|
+
:contigs_segmented => @contigs_segmented,
|
105
|
+
:p_contigs_segmented => @p_contigs_segmented,
|
106
|
+
:contigs_good => @contigs_good,
|
107
|
+
:p_contigs_good => @p_contigs_good
|
61
108
|
}
|
62
109
|
end
|
63
110
|
|
64
|
-
def
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
111
|
+
def get_read_length(left, right)
|
112
|
+
count=0
|
113
|
+
file = File.open(left.split(",").first)
|
114
|
+
name = file.readline.chomp
|
115
|
+
seq = file.readline.chomp
|
116
|
+
na = file.readline.chomp
|
117
|
+
qual = file.readline.chomp
|
118
|
+
read_length = 0
|
119
|
+
while name and count < 5000 # get max read length from first 5000 reads
|
120
|
+
read_length = [read_length, seq.length].max
|
121
|
+
name = file.readline.chomp rescue nil
|
122
|
+
seq = file.readline.chomp rescue nil
|
123
|
+
na = file.readline.chomp rescue nil
|
124
|
+
qual = file.readline.chomp rescue nil
|
125
|
+
count+=1
|
126
|
+
end
|
127
|
+
read_length
|
128
|
+
end
|
129
|
+
|
130
|
+
def split_bam bamfile
|
131
|
+
base = File.basename(bamfile, '.bam')
|
132
|
+
valid = "#{base}.valid.bam"
|
133
|
+
invalid = "#{base}.invalid.bam"
|
134
|
+
if !File.exist? valid
|
135
|
+
cmd = "#{@bam_splitter} #{bamfile}"
|
136
|
+
splitter = Cmd.new cmd
|
137
|
+
splitter.run
|
138
|
+
if !splitter.status.success?
|
139
|
+
logger.warn "Couldn't split bam file: #{bamfile}" +
|
140
|
+
"\n#{splitter.stdout}\n#{splitter.stderr}"
|
88
141
|
end
|
89
|
-
check_bridges
|
90
|
-
else
|
91
|
-
raise "samfile #{samfile} not found"
|
92
142
|
end
|
143
|
+
if !File.exist? valid
|
144
|
+
logger.warn "Splitting failed to create valid bam: #{valid}"
|
145
|
+
end
|
146
|
+
[valid, invalid]
|
93
147
|
end
|
94
148
|
|
95
|
-
def
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
@both_mapped = 0
|
101
|
-
@properly_paired = 0
|
102
|
-
@improperly_paired = 0
|
103
|
-
@proper_orientation = 0
|
104
|
-
@improper_orientation = 0
|
105
|
-
@same_contig = 0
|
106
|
-
@realistic_overlap = 0
|
107
|
-
@unrealistic_overlap = 0
|
108
|
-
@realistic_fragment = 0
|
109
|
-
@unrealistic_fragment = 0
|
110
|
-
@n_uncovered_bases = 0
|
111
|
-
@n_uncovered_base_contigs = 0 # any base cov < 1
|
112
|
-
@n_uncovered_contigs = 0 # mean cov < 1
|
113
|
-
@n_lowcovered_contigs = 0 # mean cov < 10
|
149
|
+
def assign_and_quantify bamfile
|
150
|
+
express = Express.new
|
151
|
+
results = express.run(@assembly, bamfile)
|
152
|
+
analyse_expression results.expression
|
153
|
+
results.align_samp
|
114
154
|
end
|
115
155
|
|
116
|
-
def
|
117
|
-
|
156
|
+
def analyse_expression express_output
|
157
|
+
express_output.each_pair do |name, eff_count|
|
158
|
+
@contigs_uncovered += 1 if eff_count < 1
|
159
|
+
@contigs_lowcovered += 1 if eff_count < 10
|
160
|
+
contig = @assembly[name]
|
161
|
+
contig.coverage = eff_count
|
162
|
+
end
|
118
163
|
end
|
119
164
|
|
120
|
-
def
|
165
|
+
def analyse_read_mappings bamfile, insertsize, insertsd, bridge=true
|
166
|
+
if File.exist?(bamfile) && File.size(bamfile) > 0
|
167
|
+
csv_output = "#{File.basename(@assembly.file)}_bam_info.csv"
|
168
|
+
csv_output = File.expand_path(csv_output)
|
121
169
|
|
122
|
-
|
170
|
+
analyse_bam bamfile, csv_output
|
171
|
+
# open output csv file
|
172
|
+
@potential_bridges = 0
|
123
173
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
# reads are paired
|
129
|
-
@both_mapped += 1 if ls.primary_aln?
|
130
|
-
if ls.read_properly_paired?
|
131
|
-
# mapped in proper pair
|
132
|
-
@properly_paired += 1
|
133
|
-
self.check_orientation(ls, rs)
|
134
|
-
else
|
135
|
-
# not mapped in proper pair
|
136
|
-
@improperly_paired += 1
|
137
|
-
if ls.chrom == rs.chrom
|
138
|
-
# both on same contig
|
139
|
-
@same_contig += 1
|
140
|
-
self.check_overlap_plausibility(ls, rs)
|
141
|
-
else
|
142
|
-
self.check_fragment_plausibility(ls, rs, realistic_dist)
|
143
|
-
end
|
174
|
+
CSV.foreach(csv_output, :headers => true,
|
175
|
+
:header_converters => :symbol,
|
176
|
+
:converters => :all) do |row|
|
177
|
+
populate_contig_data row
|
144
178
|
end
|
145
|
-
|
146
|
-
end
|
147
|
-
|
148
|
-
def check_orientation ls, rs
|
149
|
-
if ls.pair_opposite_strands?
|
150
|
-
# mates in proper orientation
|
151
|
-
@proper_orientation += 1
|
152
|
-
@good += 1
|
179
|
+
@bad = @fragments_mapped - @good
|
153
180
|
else
|
154
|
-
|
155
|
-
|
156
|
-
|
181
|
+
logger.warn "couldn't find bamfile: #{bamfile}"
|
182
|
+
end
|
183
|
+
@assembly.assembly.each_pair do |name, contig|
|
184
|
+
@contigs_good += 1 if contig.score >= 0.5
|
157
185
|
end
|
186
|
+
update_proportions
|
158
187
|
end
|
159
188
|
|
160
|
-
def
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
189
|
+
def update_proportions
|
190
|
+
nbases = @assembly.n_bases.to_f
|
191
|
+
ncontigs = @assembly.size.to_f
|
192
|
+
|
193
|
+
@p_bases_uncovered = @bases_uncovered / nbases
|
194
|
+
@p_contigs_uncovbase = @contigs_uncovbase / ncontigs
|
195
|
+
@p_contigs_uncovered = @contigs_uncovered / ncontigs
|
196
|
+
@p_contigs_lowcovered = @contigs_lowcovered / ncontigs
|
197
|
+
@p_contigs_segmented = @contigs_segmented / ncontigs
|
198
|
+
@p_contigs_good = @contigs_good / ncontigs
|
199
|
+
|
200
|
+
@p_good_mapping = @good.to_f / @fragments.to_f
|
201
|
+
@p_fragments_mapped = @fragments_mapped / @fragments.to_f
|
170
202
|
end
|
171
203
|
|
172
|
-
def
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
# increase the evidence for this bridge
|
180
|
-
key = [ls.chrom, rs.chrom].sort.join("<>").to_sym
|
181
|
-
if @bridges.has_key? key
|
182
|
-
@bridges[key] += 1
|
183
|
-
else
|
184
|
-
@bridges[key] = 1
|
204
|
+
def analyse_bam bamfile, csv_output
|
205
|
+
if !File.exist?(csv_output)
|
206
|
+
cmd = "#{@bam_reader} #{bamfile} #{csv_output}"
|
207
|
+
reader = Cmd.new cmd
|
208
|
+
reader.run
|
209
|
+
if !reader.status.success?
|
210
|
+
logger.warn "couldn't get information from bam file: #{bamfile}"
|
185
211
|
end
|
186
|
-
@realistic_fragment += 1
|
187
|
-
@good += 1
|
188
|
-
else
|
189
|
-
@unrealistic_fragment += 1
|
190
|
-
@bad += 1
|
191
212
|
end
|
192
213
|
end
|
193
214
|
|
194
|
-
def
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
215
|
+
def populate_contig_data row
|
216
|
+
contig = @assembly[row[:name]]
|
217
|
+
scale = 0.7
|
218
|
+
contig.p_seq_true = (row[:p_seq_true] - scale) * (1.0 / (1 - scale))
|
219
|
+
contig.uncovered_bases = row[:bases_uncovered]
|
220
|
+
@bases_uncovered += contig.uncovered_bases
|
221
|
+
if row[:fragments_mapped] and row[:fragments_mapped] > 0
|
222
|
+
contig.p_good = row[:good]/row[:fragments_mapped].to_f
|
223
|
+
end
|
224
|
+
contig.p_not_segmented = row[:p_not_segmented]
|
225
|
+
if contig.p_not_segmented < 0.5
|
226
|
+
@contigs_segmented += 1
|
227
|
+
end
|
228
|
+
contig.in_bridges = row[:bridges]
|
229
|
+
contig.p_unique = row[:p_unique]
|
230
|
+
if row[:bridges] > 1
|
231
|
+
@potential_bridges += 1
|
232
|
+
end
|
233
|
+
@fragments_mapped += row[:fragments_mapped]
|
234
|
+
@good += row[:good]
|
235
|
+
if row[:bases_uncovered] > 0
|
236
|
+
@contigs_uncovbase += 1
|
206
237
|
end
|
207
238
|
end
|
208
239
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
#
|
216
|
-
#
|
217
|
-
|
218
|
-
@
|
219
|
-
|
220
|
-
contig.uncovered_bases, total = 0, 0
|
221
|
-
coverage.each do |e|
|
222
|
-
total += e
|
223
|
-
contig.uncovered_bases += 1 if e < 1
|
224
|
-
end
|
225
|
-
tot_length += coverage.length
|
226
|
-
tot_coverage += total
|
227
|
-
contig.mean_coverage = total / coverage.length.to_f
|
228
|
-
@n_uncovered_bases += contig.uncovered_bases
|
229
|
-
@n_uncovered_base_contigs += 1 if contig.uncovered_bases > 0
|
230
|
-
@n_uncovered_contigs += 1 if contig.mean_coverage < 1
|
231
|
-
@n_lowcovered_contigs += 1 if contig.mean_coverage < 10
|
232
|
-
end
|
233
|
-
@mean_coverage = (tot_coverage / tot_length.to_f).round(2)
|
234
|
-
@p_uncovered_bases = @n_uncovered_bases / @assembly.n_bases.to_f
|
235
|
-
@p_uncovered_base_contigs = @n_uncovered_base_contigs /
|
236
|
-
@assembly.size.to_f
|
237
|
-
@p_uncovered_contigs = @n_uncovered_contigs / @assembly.size.to_f
|
238
|
-
@p_lowcovered_contigs = @n_lowcovered_contigs / @assembly.size.to_f
|
240
|
+
def initial_values
|
241
|
+
@fragments = 0
|
242
|
+
@fragments_mapped = 0
|
243
|
+
@good = 0
|
244
|
+
@bad = 0
|
245
|
+
@bases_uncovered = 0
|
246
|
+
@contigs_uncovbase = 0 # any base cov < 1
|
247
|
+
@contigs_uncovered = 0 # mean cov < 1
|
248
|
+
@contigs_lowcovered = 0 # mean cov < 10
|
249
|
+
@contigs_segmented = 0 # p_not_segmented < 0.5
|
250
|
+
@contigs_good = 0
|
239
251
|
end
|
240
252
|
|
241
253
|
end # ReadMetrics
|
242
254
|
|
243
255
|
end # Transrate
|
244
|
-
|