transrate 0.3.1 → 1.0.0.alpha.1

@@ -7,12 +7,11 @@ require 'transrate/transrater'
  require 'transrate/version'
  require 'transrate/contig'
  require 'transrate/assembly'
- require 'transrate/bowtie2'
+ require 'transrate/snap'
+ require 'transrate/express'
  require 'transrate/read_metrics'
  require 'transrate/comparative_metrics'
  require 'transrate/contig_metrics'
- require 'transrate/metric'
- require 'transrate/dimension_reduce'
  require 'transrate/samtools'
  require 'transrate/cmd'
  require 'transrate/transrate.so'
@@ -184,55 +184,6 @@ module Transrate
 
  end # basic_bin_stats
 
- # Calls *block* with two arguments, the contig and an array
- # of integer per-base coverage counts.
- #
- # @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
- # @param block [Block] the block to call
- def each_with_coverage(bam, &block)
- logger.debug 'enumerating assembly with coverage'
- # generate coverage with samtools
- covfile = Samtools.coverage bam
- # get an assembly enumerator
- assembly_enum = @assembly.to_enum
- contig_name, contig = assembly_enum.next
- # precreate an array of the correct size to contain
- # coverage. this is necessary because samtools mpileup
- # doesn't print a result line for bases with 0 coverage
- contig.coverage = Array.new(contig.length, 0)
- # the columns we need
- name_i, pos_i, cov_i = 0, 1, 3
- # parse the coverage file
- File.open(covfile).each_line do |line|
- cols = line.chomp.split("\t")
- unless (cols && cols.length > 4)
- # last line
- break
- end
- # extract the columns
- name = Bio::FastaDefline.new(cols[name_i]).entry_id
- pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
- unless contig_name == name
- while contig_name != name
- begin
- block.call(contig, contig.coverage)
- contig_name, contig = assembly_enum.next
- contig.coverage = Array.new(contig.length, 0)
- rescue StopIteration => stop_error
- logger.error 'reached the end of assembly enumerator while ' +
- 'there were contigs left in the coverage results'
- logger.error "final assembly contig: #{@assembly.last.name}"
- logger.error "coverage contig: #{name}"
- raise stop_error
- end
- end
- end
- contig.coverage[pos - 1] = cov
- end
- # yield the final contig
- block.call(contig, contig.coverage)
- end
-
  end # Assembly
 
  end # Transrate
@@ -14,6 +14,10 @@ module Transrate
  @stdout, @stderr, @status = Open3.capture3 @cmd
  end
 
+ def to_s
+ @cmd
+ end
+
  end
 
  end
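The new Cmd#to_s means a Cmd object interpolates as its underlying command string, which is convenient for logging. A minimal usage sketch (the echo command is only a stand-in; the stdout and status readers are used the same way elsewhere in this diff):

    # hypothetical use of Transrate::Cmd with the new to_s
    cmd = Transrate::Cmd.new('echo hello')
    cmd.run
    puts "ran: #{cmd}"                      # to_s returns the command string
    puts cmd.stdout if cmd.status.success?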
@@ -11,7 +11,6 @@ module Transrate
  attr_reader :has_run
  attr_reader :reference_coverage
  attr_reader :comp_stats
- attr_reader :n_chimeras, :p_chimeras
 
  def initialize assembly, reference, threads
  @assembly = assembly
@@ -23,14 +22,12 @@ module Transrate
  def run
  @crbblast = reciprocal_best_blast
  @reference_coverage = coverage @crbblast
- @collapse_factor = collapse_factor @crbblast.reciprocals
  @reciprocal_hits = @crbblast.size
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
  @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
  @n_contigs_with_recip = @crbblast.reciprocals.size
  count_ref_crbbs
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
- chimeras @crbblast
  self.run_comp_stats
  @has_run = true
  end
@@ -43,9 +40,6 @@ module Transrate
  @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
  @comp_stats[:rbh_per_reference] = @rbh_per_reference
  @comp_stats[:reference_coverage] = @reference_coverage
- @comp_stats[:collapse_factor] = @collapse_factor
- @comp_stats[:n_chimeras] = @n_chimeras
- @comp_stats[:p_chimeras] = @p_chimeras
  end
 
  def reciprocal_best_blast
@@ -71,19 +65,29 @@ module Transrate
  contig = @assembly[hit.query]
  contig.has_crb = true
  # how much of the reference is covered by this single contig
- contig.reference_coverage = hit.alnlen / hit.tlen
+ if crbblast.target_is_prot
+ contig.reference_coverage =
+ (hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
+ else
+ contig.reference_coverage =
+ (hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
+ end
  contig.hits << hit
  end
  end
  total_coverage = 0
  total_length = 0
  cov = [0.25, 0.5, 0.75, 0.85, 0.95]
+ @cov ||= [0, 0, 0, 0, 0]
  @reference.each_value do |ref_contig|
  key = ref_contig.name
  list = ref_contig.hits
- total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
-
- next if list.empty? # ah this is what was breaking everything
+ if crbblast.target_is_prot
+ total_length += ref_contig.length * 3
+ else
+ total_length += ref_contig.length
+ end
+ next if list.empty?
  blocks = []
  target_length = 0
  list.each do |hit|
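Per-hit reference coverage now discounts mismatches and gaps, and divides by three times the target length when the reference is a protein (each residue corresponds to three query bases). A self-contained sketch of that calculation, using a hypothetical hit struct in place of the real CRB-BLAST hit object:

    # hypothetical hit with CRB-BLAST-style fields (not the real hit class)
    Hit = Struct.new(:alnlen, :mismatches, :gaps, :tlen)
    hit = Hit.new(300, 12, 3, 100)    # 300 aligned columns against a 100-residue protein

    target_is_prot = true
    matched = hit.alnlen - hit.mismatches - hit.gaps          # 285 matching positions
    denominator = target_is_prot ? 3.0 * hit.tlen : hit.tlen.to_f
    reference_coverage = matched / denominator                # => 0.95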
@@ -162,9 +166,7 @@ module Transrate
  end # each_with_index a
  # sum blocks to find total coverage
  length_of_coverage = calculate_coverage blocks
- @cov ||= [0, 0, 0, 0, 0]
  if target_length > 0
- # puts "#{length_of_coverage} / #{target_length.to_f}"
  ref_p = length_of_coverage / target_length.to_f
  else
  ref_p = 0
@@ -179,10 +181,11 @@ module Transrate
 
  total_coverage += length_of_coverage
  end
+
  cov.each_with_index do |p, i|
  @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
  @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
- @cov[i]/@reference.size.to_f
+ @cov[i]/@reference.size.to_f
  end
  total_coverage / total_length.to_f
  end
@@ -210,44 +213,6 @@ module Transrate
  end
  end
 
- def chimeras crbblast
- @n_chimeras = 0
- crbblast.reciprocals.each_pair do |key, list|
- p = 0
- list.each_with_index do |a, i|
- list.each_with_index do |b, j|
- if j>i
- if a.target == b.target
- astart, astop = [a.tstart, a.tend].minmax
- bstart, bstop = [b.tstart, b.tend].minmax
-
- oa = overlap_amount(astart, astop, bstart, bstop)
- if oa > 0.75
- p += 1
- end
- else
- astart, astop = [a.qstart, a.qend].minmax
- bstart, bstop = [b.qstart, b.qend].minmax
-
- oa = overlap_amount(astart, astop, bstart, bstop)
- if oa < 0.25
- p += 1
- end
- end
- end
- end
- end
- if p/list.size.to_f >= 0.5
- @n_chimeras += 1
- unless @assembly.assembly.key? key
- puts "key not in assembly: #{key}"
- end
- @assembly[key].is_chimera = true
- end
- end
- @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
- end
-
  def overlap(astart, astop, bstart, bstop)
  if astart == bstart and astop == bstop
  return 0
@@ -300,19 +265,6 @@ module Transrate
  end
  end
 
- # Count unique reference proteins per contig
- def collapse_factor reciprocals
- return @collapse_factor unless @collapse_factor.nil?
- cf_sum = 0
- reciprocals.each do |query, hits|
- uniq_hits = Set.new hits.map{ |h| h.target }
- cf = uniq_hits.length
- @assembly[query].collapse_factor = cf
- cf_sum += cf
- end
- cf_sum / reciprocals.size
- end
-
  end # ComparativeMetrics
 
  end # Transrate
@@ -10,9 +10,12 @@ module Transrate
  def_delegators :@seq, :size, :length
  attr_accessor :seq, :name
  # read-based metrics
- attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
+ attr_accessor :coverage, :uncovered_bases, :p_uncovered_bases
+ attr_accessor :p_seq_true, :p_unique
+ attr_accessor :low_uniqueness_bases, :in_bridges
+ attr_accessor :p_good, :p_not_segmented
  # reference-based metrics
- attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
+ attr_accessor :has_crb, :reference_coverage
  attr_accessor :hits
 
  def initialize(seq, name: nil)
@@ -22,11 +25,16 @@ module Transrate
  @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
  @hits = []
  @reference_coverage = 0
- @collapse_factor = 0
- @is_chimera = false
  @has_crb = false
  @in_bridges = 0
- @mean_coverage = 0
+ @p_seq_true = 0
+ @low_uniqueness_bases = 0
+ @p_good = -1
+ @uncovered_bases = length
+ @p_uncovered_bases = 1
+ @p_unique = 0
+ @p_not_segmented = 1
+ @score = -1
  end
 
  def each &block
@@ -43,34 +51,38 @@ module Transrate
  :cpg_count => cpg_count,
  :cpg_ratio => cpg_ratio,
  :orf_length => orf_length,
- :linguistic_complexity_6 => linguistic_complexity(6)
+ :linguistic_complexity_6 => linguistic_complexity(6),
  }
  end
 
  def read_metrics
- read = @coverage ? {
- :uncovered_bases => uncovered_bases,
- :mean_coverage => mean_coverage,
- :in_bridges => in_bridges
+ read = @p_good>=0 ? {
+ :in_bridges => in_bridges,
+ :p_good => @p_good,
+ :p_bases_covered => p_bases_covered,
+ :p_seq_true => p_seq_true,
+ :score => score,
+ :p_unique => p_unique,
+ :p_not_segmented => p_not_segmented
  } : {
- :uncovered_bases => "NA",
- :mean_coverage => "NA",
- :in_bridges => in_bridges
+ :in_bridges => "NA",
+ :p_good => "NA",
+ :p_bases_covered => "NA",
+ :p_seq_true => "NA",
+ :score => "NA",
+ :p_unique => p_unique,
+ :p_not_segmented => p_not_segmented
  }
  end
 
  def comparative_metrics
  reference = @has_crb ? {
  :has_crb => has_crb,
- :collapse_factor => collapse_factor,
  :reference_coverage => reference_coverage,
- :is_chimera => is_chimera,
  :hits => hits.map{ |h| h.target }.join(";")
  } : {
  :has_crb => false,
- :collapse_factor => "NA",
  :reference_coverage => "NA",
- :is_chimera => "NA",
  :hits => "NA"
  }
  end
@@ -89,7 +101,7 @@ module Transrate
  composition(@seq.seq)
  alphabet = ['a', 'c', 'g', 't', 'n']
  @base_composition = {}
- @dibase_composition={}
+ @dibase_composition = {}
  bases = []
  dibases = []
  alphabet.each do |c|
@@ -208,6 +220,33 @@ module Transrate
  def linguistic_complexity k
  return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
  end
+
+ def p_bases_covered
+ 1 - p_uncovered_bases
+ end
+
+ def uncovered_bases= n
+ @uncovered_bases = n
+ @p_uncovered_bases = n / length.to_f
+ end
+
+ def p_unique_bases
+ (length - low_uniqueness_bases) / length.to_f
+ end
+
+ # Contig score (geometric mean of all score components)
+ def score
+ return @score if @score != -1
+ prod =
+ [p_bases_covered, 0.01].max * # proportion of bases covered
+ [p_not_segmented, 0.01].max * # prob contig has 0 changepoints
+ [p_good, 0.01].max * # proportion of reads that mapped good
+ [p_seq_true, 0.01].max * # scaled 1 - mean per-base edit distance
+ [p_unique, 0.01].max # prop mapQ >= 5
+ s = prod ** (1.0 / 5)
+ s = 0.01 if !s
+ @score = [s, 0.01].max
+ end
  end
 
  end
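The new per-contig score is the geometric mean of five components, each floored at 0.01 so a single zero cannot wipe out the whole score. A worked sketch with made-up component values:

    # hypothetical component values for one contig
    components = [
      0.98,  # p_bases_covered: proportion of bases with read coverage
      0.90,  # p_not_segmented: probability the contig has no coverage changepoints
      0.85,  # p_good: proportion of mapped fragments that map well
      0.95,  # p_seq_true: scaled 1 - mean per-base edit distance
      0.80   # p_unique: proportion of uniquely mapping reads (mapQ >= 5)
    ]
    floored = components.map { |c| [c, 0.01].max }
    score = floored.reduce(:*) ** (1.0 / floored.length)
    puts score.round(3)   # => ~0.894, the geometric mean of the five components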
@@ -0,0 +1,79 @@
+
+ module Transrate
+
+ class ExpressError < StandardError
+ end
+
+ class Express
+
+ require 'ostruct'
+
+ # return an Express object
+ def initialize
+ which = Cmd.new('which express')
+ which.run
+ if !which.status.success?
+ raise ExpressError.new("could not find express in the path")
+ end
+ @express = which.stdout.split("\n").first
+ end
+
+ # return struct containing:
+ # results_file => path to the express results TSV
+ # expression => a hash of target => effective_count
+ # align_samp => path to the sampled alignments file
+ def run assembly, bamfile
+ assembly = assembly.file if assembly.is_a? Assembly
+
+ ex_output = 'results.xprs'
+ fin_output = "#{File.basename assembly}_#{ex_output}"
+
+ unless File.exists? fin_output
+ runner = Cmd.new build_command(assembly, bamfile)
+ runner.run
+ unless runner.status.success?
+ raise ExpressError.new("Express failed\n" +
+ runner.stderr + "\n" +
+ runner.stdout)
+ end
+ File.rename(ex_output, fin_output)
+ end
+
+ OpenStruct.new(:results_file => fin_output,
+ :expression => load_expression(fin_output),
+ :align_samp => 'hits.1.samp.bam')
+ end
+
+ # return the constructed eXpress command
+ def build_command assembly, bamfile
+ cmd = "#{@express}"
+ cmd << " #{File.expand_path assembly}"
+ cmd << " #{File.expand_path bamfile}"
+ cmd << " --output-dir ."
+ cmd << " --output-align-samp"
+ cmd << " --no-update-check"
+ cmd << " --additional-online 1"
+ cmd
+ end
+
+ # return a hash of target => effective_count created
+ # by parsing the results file
+ def load_expression file
+ expression = {}
+ first = true
+ File.open(file).each do |line|
+ if first
+ first = false
+ next
+ end
+ line = line.chomp.split("\t")
+ target = line[1]
+ effective_count = line[7]
+ expression[target] = effective_count.to_f
+ end
+ expression
+ end
+
+ end # Express
+
+ end # Transrate
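A sketch of how the new Express wrapper could be driven on its own; it assumes the eXpress binary is on the PATH, and the assembly and BAM paths below are placeholders:

    # hypothetical standalone use of Transrate::Express
    express = Transrate::Express.new          # raises ExpressError if eXpress is not in the path
    results = express.run('assembly.fa', 'valid.sorted.bam')

    puts results.results_file                 # renamed copy of results.xprs
    puts results.align_samp                   # sampled alignment file written by eXpress
    results.expression.each_pair do |target, effective_count|
      puts "#{target}\t#{effective_count}"
    end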
@@ -2,243 +2,254 @@ module Transrate
 
  class ReadMetrics
 
- require 'bettersam'
- require 'bio-samtools'
-
- attr_reader :total
+ attr_reader :fragments_mapping
+ attr_reader :p_good_mapping
  attr_reader :bad
  attr_reader :supported_bridges
- attr_reader :pr_good_mapping
- attr_reader :percent_mapping
- attr_reader :prop_expressed
  attr_reader :has_run
+ attr_reader :read_length
 
  def initialize assembly
  @assembly = assembly
- @mapper = Bowtie2.new
+ @mapper = Snap.new
  self.initial_values
+
+ load_executables
+ @read_length = 100
+ end
+
+ def load_executables
+ @bam_splitter = get_bin_path 'bam-split'
+ @bam_reader = get_bin_path 'bam-read'
+ end
+
+ def get_bin_path bin
+ which_bin = Cmd.new("which #{bin}")
+ which_bin.run
+ if !which_bin.status.success?
+ raise IOError.new("ReadMetrics: could not find #{bin} in path")
+ end
+ which_bin.stdout.split("\n").first
  end
 
  def run left, right, insertsize:200, insertsd:50, threads:8
+ # check all read files exist
  [left, right].each do |readfile|
- unless File.exist? readfile
- raise IOError.new "ReadMetrics read file does not exist: #{readfile}"
+ raise IOError.new "Read file is nil" if readfile.nil?
+ readfile.split(",").each do |file|
+ unless File.exist? file
+ raise IOError.new "ReadMetrics: read file does not exist: #{file}"
+ end
  end
  end
- @mapper.build_index @assembly.file
- @num_pairs = `wc -l #{left}`.strip.split(/\s+/)[0].to_i/4
- samfile = @mapper.map_reads(@assembly.file, left, right,
+
+ # estimate max read length
+ @read_length = get_read_length(left, right)
+
+ # map reads
+ @mapper.build_index(@assembly.file, threads)
+ bamfile = @mapper.map_reads(@assembly.file, left, right,
  insertsize: insertsize,
  insertsd: insertsd,
  threads: threads)
- # check_bridges
- analyse_read_mappings(samfile, insertsize, insertsd, true)
- analyse_coverage(samfile)
- @pr_good_mapping = @good.to_f / @num_pairs.to_f
- @percent_mapping = @total.to_f / @num_pairs.to_f * 100.0
- @pc_good_mapping = @pr_good_mapping * 100.0
+ @fragments = @mapper.read_count
+
+ # classify bam file into valid and invalid alignments
+ sorted_bam = "#{File.basename(bamfile, '.bam')}.merged.sorted.bam"
+ readsorted_bam = "#{File.basename(bamfile, '.bam')}.valid.sorted.bam"
+ unless File.exist? sorted_bam
+ valid_bam, invalid_bam = split_bam bamfile
+ readsorted_bam = Samtools.readsort_bam(valid_bam, threads)
+ File.delete valid_bam
+ end
+
+ # pass valid alignments to eXpress for assignment
+ # always have to run the eXpress command to load the results
+ assigned_bam = assign_and_quantify readsorted_bam
+
+ # merge the assigned alignments back with the invalid ones
+ unless File.exist? sorted_bam
+ File.delete readsorted_bam
+ merged_bam = "#{File.basename(bamfile, '.bam')}.merged.bam"
+ Samtools.merge_bam(invalid_bam, assigned_bam, merged_bam, threads=threads)
+ File.delete invalid_bam
+ File.delete assigned_bam
+ sorted_bam = Samtools.sort_bam(merged_bam, threads)
+ File.delete merged_bam
+ end
+
+ # analyse the final mappings
+ analyse_read_mappings(sorted_bam, insertsize, insertsd, true)
+
  @has_run = true
  end
 
  def read_stats
  {
- :num_pairs => @num_pairs,
- :total_mappings => @total,
- :percent_mapping => @percent_mapping,
+ :fragments => @fragments,
+ :fragments_mapped => @fragments_mapped,
+ :p_fragments_mapped => @p_fragments_mapped,
  :good_mappings => @good,
- :pc_good_mapping => @pc_good_mapping,
+ :p_good_mapping => @p_good_mapping,
  :bad_mappings => @bad,
- :potential_bridges => @supported_bridges,
- :mean_coverage => @mean_coverage,
- :n_uncovered_bases => @n_uncovered_bases,
- :p_uncovered_bases => @p_uncovered_bases,
- :n_uncovered_base_contigs => @n_uncovered_base_contigs,
- :p_uncovered_base_contigs => @p_uncovered_base_contigs,
- :n_uncovered_contigs => @n_uncovered_contigs,
- :p_uncovered_contigs => @p_uncovered_contigs,
- :n_lowcovered_contigs => @n_lowcovered_contigs,
- :p_lowcovered_contigs => @p_lowcovered_contigs
+ :potential_bridges => @potential_bridges,
+ :bases_uncovered => @bases_uncovered,
+ :p_bases_uncovered => @p_bases_uncovered,
+ :contigs_uncovbase => @contigs_uncovbase,
+ :p_contigs_uncovbase => @p_contigs_uncovbase,
+ :contigs_uncovered => @contigs_uncovered,
+ :p_contigs_uncovered => @p_contigs_uncovered,
+ :contigs_lowcovered => @contigs_lowcovered,
+ :p_contigs_lowcovered => @p_contigs_lowcovered,
+ :contigs_segmented => @contigs_segmented,
+ :p_contigs_segmented => @p_contigs_segmented,
+ :contigs_good => @contigs_good,
+ :p_contigs_good => @p_contigs_good
  }
  end
 
- def analyse_read_mappings samfile, insertsize, insertsd, bridge=true
- @bridges = {} if bridge
- realistic_dist = self.realistic_distance(insertsize, insertsd)
- if File.exists?(samfile) && File.size(samfile) > 0
- ls = BetterSam.new
- rs = BetterSam.new
- sam = File.open(samfile)
- line = sam.readline
- while line and line=~/^@/
- line = sam.readline rescue nil
- end
- while line
- ls.parse_line(line)
- if ls.mate_unmapped?
- self.check_read_single(ls)
- line = sam.readline rescue nil
- else
- line2 = sam.readline rescue nil
- if line2
- rs.parse_line(line2)
- self.check_read_pair(ls, rs, realistic_dist)
- end
- line = sam.readline rescue nil
- end
+ def get_read_length(left, right)
+ count=0
+ file = File.open(left.split(",").first)
+ name = file.readline.chomp
+ seq = file.readline.chomp
+ na = file.readline.chomp
+ qual = file.readline.chomp
+ read_length = 0
+ while name and count < 5000 # get max read length from first 5000 reads
+ read_length = [read_length, seq.length].max
+ name = file.readline.chomp rescue nil
+ seq = file.readline.chomp rescue nil
+ na = file.readline.chomp rescue nil
+ qual = file.readline.chomp rescue nil
+ count+=1
+ end
+ read_length
+ end
+
+ def split_bam bamfile
+ base = File.basename(bamfile, '.bam')
+ valid = "#{base}.valid.bam"
+ invalid = "#{base}.invalid.bam"
+ if !File.exist? valid
+ cmd = "#{@bam_splitter} #{bamfile}"
+ splitter = Cmd.new cmd
+ splitter.run
+ if !splitter.status.success?
+ logger.warn "Couldn't split bam file: #{bamfile}" +
+ "\n#{splitter.stdout}\n#{splitter.stderr}"
  end
- check_bridges
- else
- raise "samfile #{samfile} not found"
  end
+ if !File.exist? valid
+ logger.warn "Splitting failed to create valid bam: #{valid}"
+ end
+ [valid, invalid]
  end
 
- def initial_values
- @num_pairs = 0
- @total = 0
- @good = 0
- @bad = 0
- @both_mapped = 0
- @properly_paired = 0
- @improperly_paired = 0
- @proper_orientation = 0
- @improper_orientation = 0
- @same_contig = 0
- @realistic_overlap = 0
- @unrealistic_overlap = 0
- @realistic_fragment = 0
- @unrealistic_fragment = 0
- @n_uncovered_bases = 0
- @n_uncovered_base_contigs = 0 # any base cov < 1
- @n_uncovered_contigs = 0 # mean cov < 1
- @n_lowcovered_contigs = 0 # mean cov < 10
+ def assign_and_quantify bamfile
+ express = Express.new
+ results = express.run(@assembly, bamfile)
+ analyse_expression results.expression
+ results.align_samp
  end
 
- def realistic_distance insertsize, insertsd
- insertsize + (3 * insertsd)
+ def analyse_expression express_output
+ express_output.each_pair do |name, eff_count|
+ @contigs_uncovered += 1 if eff_count < 1
+ @contigs_lowcovered += 1 if eff_count < 10
+ contig = @assembly[name]
+ contig.coverage = eff_count
+ end
  end
 
- def check_read_single ls
+ def analyse_read_mappings bamfile, insertsize, insertsd, bridge=true
+ if File.exist?(bamfile) && File.size(bamfile) > 0
+ csv_output = "#{File.basename(@assembly.file)}_bam_info.csv"
+ csv_output = File.expand_path(csv_output)
 
- end
+ analyse_bam bamfile, csv_output
+ # open output csv file
+ @potential_bridges = 0
 
- def check_read_pair ls, rs, realistic_dist
- return unless ls.primary_aln?
- @total += 1
- if ls.both_mapped?
- # reads are paired
- @both_mapped += 1 if ls.primary_aln?
- if ls.read_properly_paired?
- # mapped in proper pair
- @properly_paired += 1
- self.check_orientation(ls, rs)
- else
- # not mapped in proper pair
- @improperly_paired += 1
- if ls.chrom == rs.chrom
- # both on same contig
- @same_contig += 1
- self.check_overlap_plausibility(ls, rs)
- else
- self.check_fragment_plausibility(ls, rs, realistic_dist)
- end
+ CSV.foreach(csv_output, :headers => true,
+ :header_converters => :symbol,
+ :converters => :all) do |row|
+ populate_contig_data row
  end
- end
- end
-
- def check_orientation ls, rs
- if ls.pair_opposite_strands?
- # mates in proper orientation
- @proper_orientation += 1
- @good += 1
+ @bad = @fragments_mapped - @good
  else
- # mates in wrong orientation
- @improper_orientation += 1
- @bad += 1
+ logger.warn "couldn't find bamfile: #{bamfile}"
+ end
+ @assembly.assembly.each_pair do |name, contig|
+ @contigs_good += 1 if contig.score >= 0.5
  end
+ update_proportions
  end
 
- def check_overlap_plausibility ls, rs
- if Math.sqrt((ls.pos - rs.pos) ** 2) < ls.seq.length
- # overlap is realistic
- @realistic_overlap += 1
- self.check_orientation(ls, rs)
- else
- # overlap not realistic
- @unrealistic_overlap+= 1
- @bad += 1
- end
+ def update_proportions
+ nbases = @assembly.n_bases.to_f
+ ncontigs = @assembly.size.to_f
+
+ @p_bases_uncovered = @bases_uncovered / nbases
+ @p_contigs_uncovbase = @contigs_uncovbase / ncontigs
+ @p_contigs_uncovered = @contigs_uncovered / ncontigs
+ @p_contigs_lowcovered = @contigs_lowcovered / ncontigs
+ @p_contigs_segmented = @contigs_segmented / ncontigs
+ @p_contigs_good = @contigs_good / ncontigs
+
+ @p_good_mapping = @good.to_f / @fragments.to_f
+ @p_fragments_mapped = @fragments_mapped / @fragments.to_f
  end
 
- def check_fragment_plausibility ls, rs, realistic_dist
- # mates on different contigs
- # are the mapping positions within a realistic distance of
- # the ends of contigs?
- ldist = [ls.pos, ls.seq.length - ls.pos].min
- rdist = [rs.pos, rs.seq.length - rs.pos].min
- if ldist + rdist <= realistic_dist
- # increase the evidence for this bridge
- key = [ls.chrom, rs.chrom].sort.join("<>").to_sym
- if @bridges.has_key? key
- @bridges[key] += 1
- else
- @bridges[key] = 1
+ def analyse_bam bamfile, csv_output
+ if !File.exist?(csv_output)
+ cmd = "#{@bam_reader} #{bamfile} #{csv_output}"
+ reader = Cmd.new cmd
+ reader.run
+ if !reader.status.success?
+ logger.warn "couldn't get information from bam file: #{bamfile}"
  end
- @realistic_fragment += 1
- @good += 1
- else
- @unrealistic_fragment += 1
- @bad += 1
  end
  end
 
- def check_bridges
- @supported_bridges = 0
- CSV.open('supported_bridges.csv', 'w') do |f|
- @bridges.each_pair do |b, count|
- start, finish = b.to_s.split('<>')
- @assembly[start].in_bridges += 1
- @assembly[finish].in_bridges += 1
- if count > 1
- f << [start, finish, count]
- @supported_bridges += 1
- end
- end
+ def populate_contig_data row
+ contig = @assembly[row[:name]]
+ scale = 0.7
+ contig.p_seq_true = (row[:p_seq_true] - scale) * (1.0 / (1 - scale))
+ contig.uncovered_bases = row[:bases_uncovered]
+ @bases_uncovered += contig.uncovered_bases
+ if row[:fragments_mapped] and row[:fragments_mapped] > 0
+ contig.p_good = row[:good]/row[:fragments_mapped].to_f
+ end
+ contig.p_not_segmented = row[:p_not_segmented]
+ if contig.p_not_segmented < 0.5
+ @contigs_segmented += 1
+ end
+ contig.in_bridges = row[:bridges]
+ contig.p_unique = row[:p_unique]
+ if row[:bridges] > 1
+ @potential_bridges += 1
+ end
+ @fragments_mapped += row[:fragments_mapped]
+ @good += row[:good]
+ if row[:bases_uncovered] > 0
+ @contigs_uncovbase += 1
  end
  end
 
- # Generate per-base and contig read coverage statistics.
- # Note that contigs less than 200 bases long are ignored in this
- # analysis.
- def analyse_coverage samfile
- bamfile, sorted, index = Samtools.sam_to_sorted_indexed_bam samfile
- bam = Bio::DB::Sam.new(:bam => sorted, :fasta => @assembly.file)
- # get per-base coverage and calculate mean,
- # identify zero-coverage bases
- n, tot_length, tot_coverage = 0, 0, 0
- @assembly.each_with_coverage(bam) do |contig, coverage|
- next if contig.length < 200
- contig.uncovered_bases, total = 0, 0
- coverage.each do |e|
- total += e
- contig.uncovered_bases += 1 if e < 1
- end
- tot_length += coverage.length
- tot_coverage += total
- contig.mean_coverage = total / coverage.length.to_f
- @n_uncovered_bases += contig.uncovered_bases
- @n_uncovered_base_contigs += 1 if contig.uncovered_bases > 0
- @n_uncovered_contigs += 1 if contig.mean_coverage < 1
- @n_lowcovered_contigs += 1 if contig.mean_coverage < 10
- end
- @mean_coverage = (tot_coverage / tot_length.to_f).round(2)
- @p_uncovered_bases = @n_uncovered_bases / @assembly.n_bases.to_f
- @p_uncovered_base_contigs = @n_uncovered_base_contigs /
- @assembly.size.to_f
- @p_uncovered_contigs = @n_uncovered_contigs / @assembly.size.to_f
- @p_lowcovered_contigs = @n_lowcovered_contigs / @assembly.size.to_f
+ def initial_values
+ @fragments = 0
+ @fragments_mapped = 0
+ @good = 0
+ @bad = 0
+ @bases_uncovered = 0
+ @contigs_uncovbase = 0 # any base cov < 1
+ @contigs_uncovered = 0 # mean cov < 1
+ @contigs_lowcovered = 0 # mean cov < 10
+ @contigs_segmented = 0 # p_not_segmented < 0.5
+ @contigs_good = 0
  end
 
  end # ReadMetrics
 
  end # Transrate
-
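ReadMetrics now accumulates per-contig values from the bam-read CSV in populate_contig_data and converts the totals into assembly-level proportions in update_proportions. A small sketch of that roll-up in isolation, with made-up counts:

    # hypothetical assembly-level counters after populate_contig_data has run
    fragments        = 1_000_000   # total fragments reported by the mapper
    fragments_mapped = 850_000
    good             = 700_000     # fragments whose alignments pass the 'good' criteria
    ncontigs         = 20_000
    contigs_good     = 14_500      # contigs with score >= 0.5

    p_fragments_mapped = fragments_mapped / fragments.to_f   # => 0.85
    p_good_mapping     = good / fragments.to_f               # => 0.7
    p_contigs_good     = contigs_good / ncontigs.to_f        # => 0.725
    puts [p_fragments_mapped, p_good_mapping, p_contigs_good].inspect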