transrate 0.3.1 → 1.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,12 +7,11 @@ require 'transrate/transrater'
7
7
  require 'transrate/version'
8
8
  require 'transrate/contig'
9
9
  require 'transrate/assembly'
10
- require 'transrate/bowtie2'
10
+ require 'transrate/snap'
11
+ require 'transrate/express'
11
12
  require 'transrate/read_metrics'
12
13
  require 'transrate/comparative_metrics'
13
14
  require 'transrate/contig_metrics'
14
- require 'transrate/metric'
15
- require 'transrate/dimension_reduce'
16
15
  require 'transrate/samtools'
17
16
  require 'transrate/cmd'
18
17
  require 'transrate/transrate.so'
@@ -184,55 +184,6 @@ module Transrate
184
184
 
185
185
  end # basic_bin_stats
186
186
 
187
- # Calls *block* with two arguments, the contig and an array
188
- # of integer per-base coverage counts.
189
- #
190
- # @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
191
- # @param block [Block] the block to call
192
- def each_with_coverage(bam, &block)
193
- logger.debug 'enumerating assembly with coverage'
194
- # generate coverage with samtools
195
- covfile = Samtools.coverage bam
196
- # get an assembly enumerator
197
- assembly_enum = @assembly.to_enum
198
- contig_name, contig = assembly_enum.next
199
- # precreate an array of the correct size to contain
200
- # coverage. this is necessary because samtools mpileup
201
- # doesn't print a result line for bases with 0 coverage
202
- contig.coverage = Array.new(contig.length, 0)
203
- # the columns we need
204
- name_i, pos_i, cov_i = 0, 1, 3
205
- # parse the coverage file
206
- File.open(covfile).each_line do |line|
207
- cols = line.chomp.split("\t")
208
- unless (cols && cols.length > 4)
209
- # last line
210
- break
211
- end
212
- # extract the columns
213
- name = Bio::FastaDefline.new(cols[name_i]).entry_id
214
- pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
215
- unless contig_name == name
216
- while contig_name != name
217
- begin
218
- block.call(contig, contig.coverage)
219
- contig_name, contig = assembly_enum.next
220
- contig.coverage = Array.new(contig.length, 0)
221
- rescue StopIteration => stop_error
222
- logger.error 'reached the end of assembly enumerator while ' +
223
- 'there were contigs left in the coverage results'
224
- logger.error "final assembly contig: #{@assembly.last.name}"
225
- logger.error "coverage contig: #{name}"
226
- raise stop_error
227
- end
228
- end
229
- end
230
- contig.coverage[pos - 1] = cov
231
- end
232
- # yield the final contig
233
- block.call(contig, contig.coverage)
234
- end
235
-
236
187
  end # Assembly
237
188
 
238
189
  end # Transrate
@@ -14,6 +14,10 @@ module Transrate
14
14
  @stdout, @stderr, @status = Open3.capture3 @cmd
15
15
  end
16
16
 
17
# String form of this command object: the command line held in @cmd
# (the same value that is handed to Open3.capture3 when the command runs).
#
# @return [String] the stored command line
def to_s
  @cmd
end
20
+
17
21
  end
18
22
 
19
23
  end
@@ -11,7 +11,6 @@ module Transrate
11
11
  attr_reader :has_run
12
12
  attr_reader :reference_coverage
13
13
  attr_reader :comp_stats
14
- attr_reader :n_chimeras, :p_chimeras
15
14
 
16
15
  def initialize assembly, reference, threads
17
16
  @assembly = assembly
@@ -23,14 +22,12 @@ module Transrate
23
22
  def run
24
23
  @crbblast = reciprocal_best_blast
25
24
  @reference_coverage = coverage @crbblast
26
- @collapse_factor = collapse_factor @crbblast.reciprocals
27
25
  @reciprocal_hits = @crbblast.size
28
26
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
29
27
  @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
30
28
  @n_contigs_with_recip = @crbblast.reciprocals.size
31
29
  count_ref_crbbs
32
30
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
33
- chimeras @crbblast
34
31
  self.run_comp_stats
35
32
  @has_run = true
36
33
  end
@@ -43,9 +40,6 @@ module Transrate
43
40
  @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
44
41
  @comp_stats[:rbh_per_reference] = @rbh_per_reference
45
42
  @comp_stats[:reference_coverage] = @reference_coverage
46
- @comp_stats[:collapse_factor] = @collapse_factor
47
- @comp_stats[:n_chimeras] = @n_chimeras
48
- @comp_stats[:p_chimeras] = @p_chimeras
49
43
  end
50
44
 
51
45
  def reciprocal_best_blast
@@ -71,19 +65,29 @@ module Transrate
71
65
  contig = @assembly[hit.query]
72
66
  contig.has_crb = true
73
67
  # how much of the reference is covered by this single contig
74
- contig.reference_coverage = hit.alnlen / hit.tlen
68
+ if crbblast.target_is_prot
69
+ contig.reference_coverage =
70
+ (hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
71
+ else
72
+ contig.reference_coverage =
73
+ (hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
74
+ end
75
75
  contig.hits << hit
76
76
  end
77
77
  end
78
78
  total_coverage = 0
79
79
  total_length = 0
80
80
  cov = [0.25, 0.5, 0.75, 0.85, 0.95]
81
+ @cov ||= [0, 0, 0, 0, 0]
81
82
  @reference.each_value do |ref_contig|
82
83
  key = ref_contig.name
83
84
  list = ref_contig.hits
84
- total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
85
-
86
- next if list.empty? # ah this is what was breaking everything
85
+ if crbblast.target_is_prot
86
+ total_length += ref_contig.length * 3
87
+ else
88
+ total_length += ref_contig.length
89
+ end
90
+ next if list.empty?
87
91
  blocks = []
88
92
  target_length = 0
89
93
  list.each do |hit|
@@ -162,9 +166,7 @@ module Transrate
162
166
  end # each_with_index a
163
167
  # sum blocks to find total coverage
164
168
  length_of_coverage = calculate_coverage blocks
165
- @cov ||= [0, 0, 0, 0, 0]
166
169
  if target_length > 0
167
- # puts "#{length_of_coverage} / #{target_length.to_f}"
168
170
  ref_p = length_of_coverage / target_length.to_f
169
171
  else
170
172
  ref_p = 0
@@ -179,10 +181,11 @@ module Transrate
179
181
 
180
182
  total_coverage += length_of_coverage
181
183
  end
184
+
182
185
  cov.each_with_index do |p, i|
183
186
  @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
184
187
  @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
185
- @cov[i]/@reference.size.to_f
188
+ @cov[i]/@reference.size.to_f
186
189
  end
187
190
  total_coverage / total_length.to_f
188
191
  end
@@ -210,44 +213,6 @@ module Transrate
210
213
  end
211
214
  end
212
215
 
213
- def chimeras crbblast
214
- @n_chimeras = 0
215
- crbblast.reciprocals.each_pair do |key, list|
216
- p = 0
217
- list.each_with_index do |a, i|
218
- list.each_with_index do |b, j|
219
- if j>i
220
- if a.target == b.target
221
- astart, astop = [a.tstart, a.tend].minmax
222
- bstart, bstop = [b.tstart, b.tend].minmax
223
-
224
- oa = overlap_amount(astart, astop, bstart, bstop)
225
- if oa > 0.75
226
- p += 1
227
- end
228
- else
229
- astart, astop = [a.qstart, a.qend].minmax
230
- bstart, bstop = [b.qstart, b.qend].minmax
231
-
232
- oa = overlap_amount(astart, astop, bstart, bstop)
233
- if oa < 0.25
234
- p += 1
235
- end
236
- end
237
- end
238
- end
239
- end
240
- if p/list.size.to_f >= 0.5
241
- @n_chimeras += 1
242
- unless @assembly.assembly.key? key
243
- puts "key not in assembly: #{key}"
244
- end
245
- @assembly[key].is_chimera = true
246
- end
247
- end
248
- @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
249
- end
250
-
251
216
  def overlap(astart, astop, bstart, bstop)
252
217
  if astart == bstart and astop == bstop
253
218
  return 0
@@ -300,19 +265,6 @@ module Transrate
300
265
  end
301
266
  end
302
267
 
303
- # Count unique reference proteins per contig
304
- def collapse_factor reciprocals
305
- return @collapse_factor unless @collapse_factor.nil?
306
- cf_sum = 0
307
- reciprocals.each do |query, hits|
308
- uniq_hits = Set.new hits.map{ |h| h.target }
309
- cf = uniq_hits.length
310
- @assembly[query].collapse_factor = cf
311
- cf_sum += cf
312
- end
313
- cf_sum / reciprocals.size
314
- end
315
-
316
268
  end # ComparativeMetrics
317
269
 
318
270
  end # Transrate
@@ -10,9 +10,12 @@ module Transrate
10
10
  def_delegators :@seq, :size, :length
11
11
  attr_accessor :seq, :name
12
12
  # read-based metrics
13
- attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
13
+ attr_accessor :coverage, :uncovered_bases, :p_uncovered_bases
14
+ attr_accessor :p_seq_true, :p_unique
15
+ attr_accessor :low_uniqueness_bases, :in_bridges
16
+ attr_accessor :p_good, :p_not_segmented
14
17
  # reference-based metrics
15
- attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
18
+ attr_accessor :has_crb, :reference_coverage
16
19
  attr_accessor :hits
17
20
 
18
21
  def initialize(seq, name: nil)
@@ -22,11 +25,16 @@ module Transrate
22
25
  @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
23
26
  @hits = []
24
27
  @reference_coverage = 0
25
- @collapse_factor = 0
26
- @is_chimera = false
27
28
  @has_crb = false
28
29
  @in_bridges = 0
29
- @mean_coverage = 0
30
+ @p_seq_true = 0
31
+ @low_uniqueness_bases = 0
32
+ @p_good = -1
33
+ @uncovered_bases = length
34
+ @p_uncovered_bases = 1
35
+ @p_unique = 0
36
+ @p_not_segmented = 1
37
+ @score = -1
30
38
  end
31
39
 
32
40
  def each &block
@@ -43,34 +51,38 @@ module Transrate
43
51
  :cpg_count => cpg_count,
44
52
  :cpg_ratio => cpg_ratio,
45
53
  :orf_length => orf_length,
46
- :linguistic_complexity_6 => linguistic_complexity(6)
54
+ :linguistic_complexity_6 => linguistic_complexity(6),
47
55
  }
48
56
  end
49
57
 
50
58
  def read_metrics
51
- read = @coverage ? {
52
- :uncovered_bases => uncovered_bases,
53
- :mean_coverage => mean_coverage,
54
- :in_bridges => in_bridges
59
+ read = @p_good>=0 ? {
60
+ :in_bridges => in_bridges,
61
+ :p_good => @p_good,
62
+ :p_bases_covered => p_bases_covered,
63
+ :p_seq_true => p_seq_true,
64
+ :score => score,
65
+ :p_unique => p_unique,
66
+ :p_not_segmented => p_not_segmented
55
67
  } : {
56
- :uncovered_bases => "NA",
57
- :mean_coverage => "NA",
58
- :in_bridges => in_bridges
68
+ :in_bridges => "NA",
69
+ :p_good => "NA",
70
+ :p_bases_covered => "NA",
71
+ :p_seq_true => "NA",
72
+ :score => "NA",
73
+ :p_unique => p_unique,
74
+ :p_not_segmented => p_not_segmented
59
75
  }
60
76
  end
61
77
 
62
78
  def comparative_metrics
63
79
  reference = @has_crb ? {
64
80
  :has_crb => has_crb,
65
- :collapse_factor => collapse_factor,
66
81
  :reference_coverage => reference_coverage,
67
- :is_chimera => is_chimera,
68
82
  :hits => hits.map{ |h| h.target }.join(";")
69
83
  } : {
70
84
  :has_crb => false,
71
- :collapse_factor => "NA",
72
85
  :reference_coverage => "NA",
73
- :is_chimera => "NA",
74
86
  :hits => "NA"
75
87
  }
76
88
  end
@@ -89,7 +101,7 @@ module Transrate
89
101
  composition(@seq.seq)
90
102
  alphabet = ['a', 'c', 'g', 't', 'n']
91
103
  @base_composition = {}
92
- @dibase_composition={}
104
+ @dibase_composition = {}
93
105
  bases = []
94
106
  dibases = []
95
107
  alphabet.each do |c|
@@ -208,6 +220,33 @@ module Transrate
208
220
  def linguistic_complexity k
209
221
  return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
210
222
  end
223
+
224
# Proportion of bases covered: the complement of p_uncovered_bases.
#
# @return [Numeric]
def p_bases_covered
  uncovered = p_uncovered_bases
  1 - uncovered
end
227
+
228
# Store the uncovered-base count and keep the derived proportion
# (@p_uncovered_bases = n / contig length) in step with it.
#
# @param n [Numeric] number of bases with no read coverage
def uncovered_bases=(n)
  contig_length = length.to_f
  @uncovered_bases = n
  @p_uncovered_bases = n / contig_length
end
232
+
233
# Proportion of bases not counted in low_uniqueness_bases.
#
# @return [Float]
def p_unique_bases
  total = length
  unique = total - low_uniqueness_bases
  unique / total.to_f
end
236
+
237
+ # Contig score (geometric mean of all score components)
238
# Each component is floored at 0.01 before multiplying, so a single
# zero (or the -1 placeholder p_good holds before read metrics are
# loaded) cannot collapse the whole product to zero or below.
# The result is cached in @score; -1 (the initial value) or nil means
# "not computed yet".
#
# @return [Float] geometric mean of the components, never below 0.01
def score
  return @score unless @score.nil? || @score == -1

  floor = 0.01
  components = [
    p_bases_covered, # proportion of bases covered
    p_not_segmented, # prob contig has 0 changepoints
    p_good,          # proportion of reads that mapped good
    p_seq_true,      # scaled 1 - mean per-base edit distance
    p_unique         # prop mapQ >= 5
  ]
  product = components.map { |c| [c, floor].max }.reduce(:*)
  # take the n-th root from the component count so the two cannot drift apart
  root = product ** (1.0 / components.size)
  @score = [root, floor].max
end
211
250
  end
212
251
 
213
252
  end
@@ -0,0 +1,79 @@
1
+
2
module Transrate

  # Raised when the express executable is missing or exits unsuccessfully.
  class ExpressError < StandardError
  end

  # Wrapper around the eXpress command-line tool: locates the binary,
  # runs it on an assembly plus BAM file and parses the results table.
  class Express

    require 'ostruct'
    require 'shellwords'

    # return an Express object
    #
    # Looks the executable up with `which express`.
    # Raises ExpressError if it is not found in the path.
    def initialize
      which = Cmd.new('which express')
      which.run
      unless which.status.success?
        raise ExpressError.new("could not find express in the path")
      end
      @express = which.stdout.split("\n").first
    end

    # Run eXpress unless its renamed results file already exists in the
    # working directory, then load the results.
    #
    # assembly may be an Assembly (its #file is used) or a path.
    # Raises ExpressError if the eXpress run fails.
    #
    # return struct containing:
    #   results_file => path to the express results TSV
    #   expression => a hash of target => effective_count
    #   align_samp => path to the sampled alignments file
    def run assembly, bamfile
      assembly = assembly.file if assembly.is_a? Assembly

      ex_output = 'results.xprs'
      fin_output = "#{File.basename assembly}_#{ex_output}"

      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
      unless File.exist? fin_output
        runner = Cmd.new build_command(assembly, bamfile)
        runner.run
        unless runner.status.success?
          raise ExpressError.new("Express failed\n" +
                                 runner.stderr + "\n" +
                                 runner.stdout)
        end
        File.rename(ex_output, fin_output)
      end

      OpenStruct.new(:results_file => fin_output,
                     :expression => load_expression(fin_output),
                     :align_samp => 'hits.1.samp.bam')
    end

    # return the constructed eXpress command
    #
    # The command is a single shell string, so the executable and both
    # paths are shell-escaped; paths without special characters come out
    # exactly as before.
    def build_command assembly, bamfile
      [
        Shellwords.escape(@express.to_s),
        Shellwords.escape(File.expand_path(assembly)),
        Shellwords.escape(File.expand_path(bamfile)),
        "--output-dir .",
        "--output-align-samp",
        "--no-update-check",
        "--additional-online 1"
      ].join(" ")
    end

    # return a hash of target => effective_count created
    # by parsing the results file
    #
    # The first line is skipped as a header. The target is read from
    # column 1 and the effective count from column 7 (0-based) of each
    # tab-separated row; rows with no target column are ignored.
    def load_expression file
      expression = {}
      # File.foreach closes the file when iteration finishes
      File.foreach(file).with_index do |line, index|
        next if index.zero? # header row
        fields = line.chomp.split("\t")
        target = fields[1]
        next if target.nil? # blank or truncated line
        expression[target] = fields[7].to_f
      end
      expression
    end

  end # Express

end # Transrate
@@ -2,243 +2,254 @@ module Transrate
2
2
 
3
3
  class ReadMetrics
4
4
 
5
- require 'bettersam'
6
- require 'bio-samtools'
7
-
8
- attr_reader :total
5
+ attr_reader :fragments_mapping
6
+ attr_reader :p_good_mapping
9
7
  attr_reader :bad
10
8
  attr_reader :supported_bridges
11
- attr_reader :pr_good_mapping
12
- attr_reader :percent_mapping
13
- attr_reader :prop_expressed
14
9
  attr_reader :has_run
10
+ attr_reader :read_length
15
11
 
16
12
  def initialize assembly
17
13
  @assembly = assembly
18
- @mapper = Bowtie2.new
14
+ @mapper = Snap.new
19
15
  self.initial_values
16
+
17
+ load_executables
18
+ @read_length = 100
19
+ end
20
+
21
+ def load_executables
22
+ @bam_splitter = get_bin_path 'bam-split'
23
+ @bam_reader = get_bin_path 'bam-read'
24
+ end
25
+
26
+ def get_bin_path bin
27
+ which_bin = Cmd.new("which #{bin}")
28
+ which_bin.run
29
+ if !which_bin.status.success?
30
+ raise IOError.new("ReadMetrics: could not find #{bin} in path")
31
+ end
32
+ which_bin.stdout.split("\n").first
20
33
  end
21
34
 
22
35
  def run left, right, insertsize:200, insertsd:50, threads:8
36
+ # check all read files exist
23
37
  [left, right].each do |readfile|
24
- unless File.exist? readfile
25
- raise IOError.new "ReadMetrics read file does not exist: #{readfile}"
38
+ raise IOError.new "Read file is nil" if readfile.nil?
39
+ readfile.split(",").each do |file|
40
+ unless File.exist? file
41
+ raise IOError.new "ReadMetrics: read file does not exist: #{file}"
42
+ end
26
43
  end
27
44
  end
28
- @mapper.build_index @assembly.file
29
- @num_pairs = `wc -l #{left}`.strip.split(/\s+/)[0].to_i/4
30
- samfile = @mapper.map_reads(@assembly.file, left, right,
45
+
46
+ # estimate max read length
47
+ @read_length = get_read_length(left, right)
48
+
49
+ # map reads
50
+ @mapper.build_index(@assembly.file, threads)
51
+ bamfile = @mapper.map_reads(@assembly.file, left, right,
31
52
  insertsize: insertsize,
32
53
  insertsd: insertsd,
33
54
  threads: threads)
34
- # check_bridges
35
- analyse_read_mappings(samfile, insertsize, insertsd, true)
36
- analyse_coverage(samfile)
37
- @pr_good_mapping = @good.to_f / @num_pairs.to_f
38
- @percent_mapping = @total.to_f / @num_pairs.to_f * 100.0
39
- @pc_good_mapping = @pr_good_mapping * 100.0
55
+ @fragments = @mapper.read_count
56
+
57
+ # classify bam file into valid and invalid alignments
58
+ sorted_bam = "#{File.basename(bamfile, '.bam')}.merged.sorted.bam"
59
+ readsorted_bam = "#{File.basename(bamfile, '.bam')}.valid.sorted.bam"
60
+ unless File.exist? sorted_bam
61
+ valid_bam, invalid_bam = split_bam bamfile
62
+ readsorted_bam = Samtools.readsort_bam(valid_bam, threads)
63
+ File.delete valid_bam
64
+ end
65
+
66
+ # pass valid alignments to eXpress for assignment
67
+ # always have to run the eXpress command to load the results
68
+ assigned_bam = assign_and_quantify readsorted_bam
69
+
70
+ # merge the assigned alignments back with the invalid ones
71
+ unless File.exist? sorted_bam
72
+ File.delete readsorted_bam
73
+ merged_bam = "#{File.basename(bamfile, '.bam')}.merged.bam"
74
+ Samtools.merge_bam(invalid_bam, assigned_bam, merged_bam, threads=threads)
75
+ File.delete invalid_bam
76
+ File.delete assigned_bam
77
+ sorted_bam = Samtools.sort_bam(merged_bam, threads)
78
+ File.delete merged_bam
79
+ end
80
+
81
+ # analyse the final mappings
82
+ analyse_read_mappings(sorted_bam, insertsize, insertsd, true)
83
+
40
84
  @has_run = true
41
85
  end
42
86
 
43
87
  def read_stats
44
88
  {
45
- :num_pairs => @num_pairs,
46
- :total_mappings => @total,
47
- :percent_mapping => @percent_mapping,
89
+ :fragments => @fragments,
90
+ :fragments_mapped => @fragments_mapped,
91
+ :p_fragments_mapped => @p_fragments_mapped,
48
92
  :good_mappings => @good,
49
- :pc_good_mapping => @pc_good_mapping,
93
+ :p_good_mapping => @p_good_mapping,
50
94
  :bad_mappings => @bad,
51
- :potential_bridges => @supported_bridges,
52
- :mean_coverage => @mean_coverage,
53
- :n_uncovered_bases => @n_uncovered_bases,
54
- :p_uncovered_bases => @p_uncovered_bases,
55
- :n_uncovered_base_contigs => @n_uncovered_base_contigs,
56
- :p_uncovered_base_contigs => @p_uncovered_base_contigs,
57
- :n_uncovered_contigs => @n_uncovered_contigs,
58
- :p_uncovered_contigs => @p_uncovered_contigs,
59
- :n_lowcovered_contigs => @n_lowcovered_contigs,
60
- :p_lowcovered_contigs => @p_lowcovered_contigs
95
+ :potential_bridges => @potential_bridges,
96
+ :bases_uncovered => @bases_uncovered,
97
+ :p_bases_uncovered => @p_bases_uncovered,
98
+ :contigs_uncovbase => @contigs_uncovbase,
99
+ :p_contigs_uncovbase => @p_contigs_uncovbase,
100
+ :contigs_uncovered => @contigs_uncovered,
101
+ :p_contigs_uncovered => @p_contigs_uncovered,
102
+ :contigs_lowcovered => @contigs_lowcovered,
103
+ :p_contigs_lowcovered => @p_contigs_lowcovered,
104
+ :contigs_segmented => @contigs_segmented,
105
+ :p_contigs_segmented => @p_contigs_segmented,
106
+ :contigs_good => @contigs_good,
107
+ :p_contigs_good => @p_contigs_good
61
108
  }
62
109
  end
63
110
 
64
- def analyse_read_mappings samfile, insertsize, insertsd, bridge=true
65
- @bridges = {} if bridge
66
- realistic_dist = self.realistic_distance(insertsize, insertsd)
67
- if File.exists?(samfile) && File.size(samfile) > 0
68
- ls = BetterSam.new
69
- rs = BetterSam.new
70
- sam = File.open(samfile)
71
- line = sam.readline
72
- while line and line=~/^@/
73
- line = sam.readline rescue nil
74
- end
75
- while line
76
- ls.parse_line(line)
77
- if ls.mate_unmapped?
78
- self.check_read_single(ls)
79
- line = sam.readline rescue nil
80
- else
81
- line2 = sam.readline rescue nil
82
- if line2
83
- rs.parse_line(line2)
84
- self.check_read_pair(ls, rs, realistic_dist)
85
- end
86
- line = sam.readline rescue nil
87
- end
111
+ def get_read_length(left, right)
112
+ count=0
113
+ file = File.open(left.split(",").first)
114
+ name = file.readline.chomp
115
+ seq = file.readline.chomp
116
+ na = file.readline.chomp
117
+ qual = file.readline.chomp
118
+ read_length = 0
119
+ while name and count < 5000 # get max read length from first 5000 reads
120
+ read_length = [read_length, seq.length].max
121
+ name = file.readline.chomp rescue nil
122
+ seq = file.readline.chomp rescue nil
123
+ na = file.readline.chomp rescue nil
124
+ qual = file.readline.chomp rescue nil
125
+ count+=1
126
+ end
127
+ read_length
128
+ end
129
+
130
+ def split_bam bamfile
131
+ base = File.basename(bamfile, '.bam')
132
+ valid = "#{base}.valid.bam"
133
+ invalid = "#{base}.invalid.bam"
134
+ if !File.exist? valid
135
+ cmd = "#{@bam_splitter} #{bamfile}"
136
+ splitter = Cmd.new cmd
137
+ splitter.run
138
+ if !splitter.status.success?
139
+ logger.warn "Couldn't split bam file: #{bamfile}" +
140
+ "\n#{splitter.stdout}\n#{splitter.stderr}"
88
141
  end
89
- check_bridges
90
- else
91
- raise "samfile #{samfile} not found"
92
142
  end
143
+ if !File.exist? valid
144
+ logger.warn "Splitting failed to create valid bam: #{valid}"
145
+ end
146
+ [valid, invalid]
93
147
  end
94
148
 
95
- def initial_values
96
- @num_pairs = 0
97
- @total = 0
98
- @good = 0
99
- @bad = 0
100
- @both_mapped = 0
101
- @properly_paired = 0
102
- @improperly_paired = 0
103
- @proper_orientation = 0
104
- @improper_orientation = 0
105
- @same_contig = 0
106
- @realistic_overlap = 0
107
- @unrealistic_overlap = 0
108
- @realistic_fragment = 0
109
- @unrealistic_fragment = 0
110
- @n_uncovered_bases = 0
111
- @n_uncovered_base_contigs = 0 # any base cov < 1
112
- @n_uncovered_contigs = 0 # mean cov < 1
113
- @n_lowcovered_contigs = 0 # mean cov < 10
149
+ def assign_and_quantify bamfile
150
+ express = Express.new
151
+ results = express.run(@assembly, bamfile)
152
+ analyse_expression results.expression
153
+ results.align_samp
114
154
  end
115
155
 
116
- def realistic_distance insertsize, insertsd
117
- insertsize + (3 * insertsd)
156
+ def analyse_expression express_output
157
+ express_output.each_pair do |name, eff_count|
158
+ @contigs_uncovered += 1 if eff_count < 1
159
+ @contigs_lowcovered += 1 if eff_count < 10
160
+ contig = @assembly[name]
161
+ contig.coverage = eff_count
162
+ end
118
163
  end
119
164
 
120
- def check_read_single ls
165
+ def analyse_read_mappings bamfile, insertsize, insertsd, bridge=true
166
+ if File.exist?(bamfile) && File.size(bamfile) > 0
167
+ csv_output = "#{File.basename(@assembly.file)}_bam_info.csv"
168
+ csv_output = File.expand_path(csv_output)
121
169
 
122
- end
170
+ analyse_bam bamfile, csv_output
171
+ # open output csv file
172
+ @potential_bridges = 0
123
173
 
124
- def check_read_pair ls, rs, realistic_dist
125
- return unless ls.primary_aln?
126
- @total += 1
127
- if ls.both_mapped?
128
- # reads are paired
129
- @both_mapped += 1 if ls.primary_aln?
130
- if ls.read_properly_paired?
131
- # mapped in proper pair
132
- @properly_paired += 1
133
- self.check_orientation(ls, rs)
134
- else
135
- # not mapped in proper pair
136
- @improperly_paired += 1
137
- if ls.chrom == rs.chrom
138
- # both on same contig
139
- @same_contig += 1
140
- self.check_overlap_plausibility(ls, rs)
141
- else
142
- self.check_fragment_plausibility(ls, rs, realistic_dist)
143
- end
174
+ CSV.foreach(csv_output, :headers => true,
175
+ :header_converters => :symbol,
176
+ :converters => :all) do |row|
177
+ populate_contig_data row
144
178
  end
145
- end
146
- end
147
-
148
- def check_orientation ls, rs
149
- if ls.pair_opposite_strands?
150
- # mates in proper orientation
151
- @proper_orientation += 1
152
- @good += 1
179
+ @bad = @fragments_mapped - @good
153
180
  else
154
- # mates in wrong orientation
155
- @improper_orientation += 1
156
- @bad += 1
181
+ logger.warn "couldn't find bamfile: #{bamfile}"
182
+ end
183
+ @assembly.assembly.each_pair do |name, contig|
184
+ @contigs_good += 1 if contig.score >= 0.5
157
185
  end
186
+ update_proportions
158
187
  end
159
188
 
160
- def check_overlap_plausibility ls, rs
161
- if Math.sqrt((ls.pos - rs.pos) ** 2) < ls.seq.length
162
- # overlap is realistic
163
- @realistic_overlap += 1
164
- self.check_orientation(ls, rs)
165
- else
166
- # overlap not realistic
167
- @unrealistic_overlap+= 1
168
- @bad += 1
169
- end
189
+ def update_proportions
190
+ nbases = @assembly.n_bases.to_f
191
+ ncontigs = @assembly.size.to_f
192
+
193
+ @p_bases_uncovered = @bases_uncovered / nbases
194
+ @p_contigs_uncovbase = @contigs_uncovbase / ncontigs
195
+ @p_contigs_uncovered = @contigs_uncovered / ncontigs
196
+ @p_contigs_lowcovered = @contigs_lowcovered / ncontigs
197
+ @p_contigs_segmented = @contigs_segmented / ncontigs
198
+ @p_contigs_good = @contigs_good / ncontigs
199
+
200
+ @p_good_mapping = @good.to_f / @fragments.to_f
201
+ @p_fragments_mapped = @fragments_mapped / @fragments.to_f
170
202
  end
171
203
 
172
- def check_fragment_plausibility ls, rs, realistic_dist
173
- # mates on different contigs
174
- # are the mapping positions within a realistic distance of
175
- # the ends of contigs?
176
- ldist = [ls.pos, ls.seq.length - ls.pos].min
177
- rdist = [rs.pos, rs.seq.length - rs.pos].min
178
- if ldist + rdist <= realistic_dist
179
- # increase the evidence for this bridge
180
- key = [ls.chrom, rs.chrom].sort.join("<>").to_sym
181
- if @bridges.has_key? key
182
- @bridges[key] += 1
183
- else
184
- @bridges[key] = 1
204
+ def analyse_bam bamfile, csv_output
205
+ if !File.exist?(csv_output)
206
+ cmd = "#{@bam_reader} #{bamfile} #{csv_output}"
207
+ reader = Cmd.new cmd
208
+ reader.run
209
+ if !reader.status.success?
210
+ logger.warn "couldn't get information from bam file: #{bamfile}"
185
211
  end
186
- @realistic_fragment += 1
187
- @good += 1
188
- else
189
- @unrealistic_fragment += 1
190
- @bad += 1
191
212
  end
192
213
  end
193
214
 
194
- def check_bridges
195
- @supported_bridges = 0
196
- CSV.open('supported_bridges.csv', 'w') do |f|
197
- @bridges.each_pair do |b, count|
198
- start, finish = b.to_s.split('<>')
199
- @assembly[start].in_bridges += 1
200
- @assembly[finish].in_bridges += 1
201
- if count > 1
202
- f << [start, finish, count]
203
- @supported_bridges += 1
204
- end
205
- end
215
+ def populate_contig_data row
216
+ contig = @assembly[row[:name]]
217
+ scale = 0.7
218
+ contig.p_seq_true = (row[:p_seq_true] - scale) * (1.0 / (1 - scale))
219
+ contig.uncovered_bases = row[:bases_uncovered]
220
+ @bases_uncovered += contig.uncovered_bases
221
+ if row[:fragments_mapped] and row[:fragments_mapped] > 0
222
+ contig.p_good = row[:good]/row[:fragments_mapped].to_f
223
+ end
224
+ contig.p_not_segmented = row[:p_not_segmented]
225
+ if contig.p_not_segmented < 0.5
226
+ @contigs_segmented += 1
227
+ end
228
+ contig.in_bridges = row[:bridges]
229
+ contig.p_unique = row[:p_unique]
230
+ if row[:bridges] > 1
231
+ @potential_bridges += 1
232
+ end
233
+ @fragments_mapped += row[:fragments_mapped]
234
+ @good += row[:good]
235
+ if row[:bases_uncovered] > 0
236
+ @contigs_uncovbase += 1
206
237
  end
207
238
  end
208
239
 
209
- # Generate per-base and contig read coverage statistics.
210
- # Note that contigs less than 200 bases long are ignored in this
211
- # analysis.
212
- def analyse_coverage samfile
213
- bamfile, sorted, index = Samtools.sam_to_sorted_indexed_bam samfile
214
- bam = Bio::DB::Sam.new(:bam => sorted, :fasta => @assembly.file)
215
- # get per-base coverage and calculate mean,
216
- # identify zero-coverage bases
217
- n, tot_length, tot_coverage = 0, 0, 0
218
- @assembly.each_with_coverage(bam) do |contig, coverage|
219
- next if contig.length < 200
220
- contig.uncovered_bases, total = 0, 0
221
- coverage.each do |e|
222
- total += e
223
- contig.uncovered_bases += 1 if e < 1
224
- end
225
- tot_length += coverage.length
226
- tot_coverage += total
227
- contig.mean_coverage = total / coverage.length.to_f
228
- @n_uncovered_bases += contig.uncovered_bases
229
- @n_uncovered_base_contigs += 1 if contig.uncovered_bases > 0
230
- @n_uncovered_contigs += 1 if contig.mean_coverage < 1
231
- @n_lowcovered_contigs += 1 if contig.mean_coverage < 10
232
- end
233
- @mean_coverage = (tot_coverage / tot_length.to_f).round(2)
234
- @p_uncovered_bases = @n_uncovered_bases / @assembly.n_bases.to_f
235
- @p_uncovered_base_contigs = @n_uncovered_base_contigs /
236
- @assembly.size.to_f
237
- @p_uncovered_contigs = @n_uncovered_contigs / @assembly.size.to_f
238
- @p_lowcovered_contigs = @n_lowcovered_contigs / @assembly.size.to_f
240
+ def initial_values
241
+ @fragments = 0
242
+ @fragments_mapped = 0
243
+ @good = 0
244
+ @bad = 0
245
+ @bases_uncovered = 0
246
+ @contigs_uncovbase = 0 # any base cov < 1
247
+ @contigs_uncovered = 0 # mean cov < 1
248
+ @contigs_lowcovered = 0 # mean cov < 10
249
+ @contigs_segmented = 0 # p_not_segmented < 0.5
250
+ @contigs_good = 0
239
251
  end
240
252
 
241
253
  end # ReadMetrics
242
254
 
243
255
  end # Transrate
244
-