transrate 0.3.1 → 1.0.0.alpha.1

@@ -7,12 +7,11 @@ require 'transrate/transrater'
  require 'transrate/version'
  require 'transrate/contig'
  require 'transrate/assembly'
- require 'transrate/bowtie2'
+ require 'transrate/snap'
+ require 'transrate/express'
  require 'transrate/read_metrics'
  require 'transrate/comparative_metrics'
  require 'transrate/contig_metrics'
- require 'transrate/metric'
- require 'transrate/dimension_reduce'
  require 'transrate/samtools'
  require 'transrate/cmd'
  require 'transrate/transrate.so'
@@ -184,55 +184,6 @@ module Transrate
 
  end # basic_bin_stats
 
- # Calls *block* with two arguments, the contig and an array
- # of integer per-base coverage counts.
- #
- # @param bam [Bio::Db::Sam] a bam alignment of reads against this assembly
- # @param block [Block] the block to call
- def each_with_coverage(bam, &block)
- logger.debug 'enumerating assembly with coverage'
- # generate coverage with samtools
- covfile = Samtools.coverage bam
- # get an assembly enumerator
- assembly_enum = @assembly.to_enum
- contig_name, contig = assembly_enum.next
- # precreate an array of the correct size to contain
- # coverage. this is necessary because samtools mpileup
- # doesn't print a result line for bases with 0 coverage
- contig.coverage = Array.new(contig.length, 0)
- # the columns we need
- name_i, pos_i, cov_i = 0, 1, 3
- # parse the coverage file
- File.open(covfile).each_line do |line|
- cols = line.chomp.split("\t")
- unless (cols && cols.length > 4)
- # last line
- break
- end
- # extract the columns
- name = Bio::FastaDefline.new(cols[name_i]).entry_id
- pos, cov = cols[pos_i].to_i, cols[cov_i].to_i
- unless contig_name == name
- while contig_name != name
- begin
- block.call(contig, contig.coverage)
- contig_name, contig = assembly_enum.next
- contig.coverage = Array.new(contig.length, 0)
- rescue StopIteration => stop_error
- logger.error 'reached the end of assembly enumerator while ' +
- 'there were contigs left in the coverage results'
- logger.error "final assembly contig: #{@assembly.last.name}"
- logger.error "coverage contig: #{name}"
- raise stop_error
- end
- end
- end
- contig.coverage[pos - 1] = cov
- end
- # yield the final contig
- block.call(contig, contig.coverage)
- end
-
  end # Assembly
 
  end # Transrate
@@ -14,6 +14,10 @@ module Transrate
  @stdout, @stderr, @status = Open3.capture3 @cmd
  end
 
+ def to_s
+ @cmd
+ end
+
  end
 
  end
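The new Cmd#to_s means a Cmd object interpolates as its underlying command string, which is convenient for logging. A minimal usage sketch (the echo command is only a stand-in; the stdout and status readers are used the same way elsewhere in this diff):

    # hypothetical use of Transrate::Cmd with the new to_s
    cmd = Transrate::Cmd.new('echo hello')
    cmd.run
    puts "ran: #{cmd}"                      # to_s returns the command string
    puts cmd.stdout if cmd.status.success?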
@@ -11,7 +11,6 @@ module Transrate
  attr_reader :has_run
  attr_reader :reference_coverage
  attr_reader :comp_stats
- attr_reader :n_chimeras, :p_chimeras
 
  def initialize assembly, reference, threads
  @assembly = assembly
@@ -23,14 +22,12 @@ module Transrate
  def run
  @crbblast = reciprocal_best_blast
  @reference_coverage = coverage @crbblast
- @collapse_factor = collapse_factor @crbblast.reciprocals
  @reciprocal_hits = @crbblast.size
  @rbh_per_reference = @reciprocal_hits.to_f / @reference.size.to_f
  @p_contigs_with_recip = @crbblast.reciprocals.size / @assembly.size.to_f
  @n_contigs_with_recip = @crbblast.reciprocals.size
  count_ref_crbbs
  @p_refs_with_recip = @n_refs_with_recip / @reference.size.to_f
- chimeras @crbblast
  self.run_comp_stats
  @has_run = true
  end
@@ -43,9 +40,6 @@ module Transrate
  @comp_stats[:n_refs_with_CRBB] = @n_refs_with_recip
  @comp_stats[:rbh_per_reference] = @rbh_per_reference
  @comp_stats[:reference_coverage] = @reference_coverage
- @comp_stats[:collapse_factor] = @collapse_factor
- @comp_stats[:n_chimeras] = @n_chimeras
- @comp_stats[:p_chimeras] = @p_chimeras
  end
 
  def reciprocal_best_blast
@@ -71,19 +65,29 @@ module Transrate
  contig = @assembly[hit.query]
  contig.has_crb = true
  # how much of the reference is covered by this single contig
- contig.reference_coverage = hit.alnlen / hit.tlen
+ if crbblast.target_is_prot
+ contig.reference_coverage =
+ (hit.alnlen - hit.mismatches - hit.gaps) / (3*hit.tlen)
+ else
+ contig.reference_coverage =
+ (hit.alnlen - hit.mismatches - hit.gaps) / hit.tlen
+ end
  contig.hits << hit
  end
  end
  total_coverage = 0
  total_length = 0
  cov = [0.25, 0.5, 0.75, 0.85, 0.95]
+ @cov ||= [0, 0, 0, 0, 0]
  @reference.each_value do |ref_contig|
  key = ref_contig.name
  list = ref_contig.hits
- total_length += crbblast.target_is_prot ? ref_contig.length : ref_contig.length*3
-
- next if list.empty? # ah this is what was breaking everything
+ if crbblast.target_is_prot
+ total_length += ref_contig.length * 3
+ else
+ total_length += ref_contig.length
+ end
+ next if list.empty?
  blocks = []
  target_length = 0
  list.each do |hit|
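Per-hit reference coverage now discounts mismatches and gaps, and divides by three times the target length when the reference is a protein (each residue corresponds to three query bases). A self-contained sketch of that calculation, using a hypothetical hit struct in place of the real CRB-BLAST hit object:

    # hypothetical hit with CRB-BLAST-style fields (not the real hit class)
    Hit = Struct.new(:alnlen, :mismatches, :gaps, :tlen)
    hit = Hit.new(300, 12, 3, 100)    # 300 aligned columns against a 100-residue protein

    target_is_prot = true
    matched = hit.alnlen - hit.mismatches - hit.gaps          # 285 matching positions
    denominator = target_is_prot ? 3.0 * hit.tlen : hit.tlen.to_f
    reference_coverage = matched / denominator                # => 0.95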
@@ -162,9 +166,7 @@ module Transrate
  end # each_with_index a
  # sum blocks to find total coverage
  length_of_coverage = calculate_coverage blocks
- @cov ||= [0, 0, 0, 0, 0]
  if target_length > 0
- # puts "#{length_of_coverage} / #{target_length.to_f}"
  ref_p = length_of_coverage / target_length.to_f
  else
  ref_p = 0
@@ -179,10 +181,11 @@ module Transrate
 
  total_coverage += length_of_coverage
  end
+
  cov.each_with_index do |p, i|
  @comp_stats["cov#{(100*p).to_i}".to_sym] = @cov[i]
  @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
- @cov[i]/@reference.size.to_f
+ @cov[i]/@reference.size.to_f
  end
  total_coverage / total_length.to_f
  end
@@ -210,44 +213,6 @@ module Transrate
  end
  end
 
- def chimeras crbblast
- @n_chimeras = 0
- crbblast.reciprocals.each_pair do |key, list|
- p = 0
- list.each_with_index do |a, i|
- list.each_with_index do |b, j|
- if j>i
- if a.target == b.target
- astart, astop = [a.tstart, a.tend].minmax
- bstart, bstop = [b.tstart, b.tend].minmax
-
- oa = overlap_amount(astart, astop, bstart, bstop)
- if oa > 0.75
- p += 1
- end
- else
- astart, astop = [a.qstart, a.qend].minmax
- bstart, bstop = [b.qstart, b.qend].minmax
-
- oa = overlap_amount(astart, astop, bstart, bstop)
- if oa < 0.25
- p += 1
- end
- end
- end
- end
- end
- if p/list.size.to_f >= 0.5
- @n_chimeras += 1
- unless @assembly.assembly.key? key
- puts "key not in assembly: #{key}"
- end
- @assembly[key].is_chimera = true
- end
- end
- @p_chimeras = @n_chimeras / crbblast.reciprocals.length.to_f
- end
-
  def overlap(astart, astop, bstart, bstop)
  if astart == bstart and astop == bstop
  return 0
@@ -300,19 +265,6 @@ module Transrate
  end
  end
 
- # Count unique reference proteins per contig
- def collapse_factor reciprocals
- return @collapse_factor unless @collapse_factor.nil?
- cf_sum = 0
- reciprocals.each do |query, hits|
- uniq_hits = Set.new hits.map{ |h| h.target }
- cf = uniq_hits.length
- @assembly[query].collapse_factor = cf
- cf_sum += cf
- end
- cf_sum / reciprocals.size
- end
-
  end # ComparativeMetrics
 
  end # Transrate
@@ -10,9 +10,12 @@ module Transrate
  def_delegators :@seq, :size, :length
  attr_accessor :seq, :name
  # read-based metrics
- attr_accessor :coverage, :uncovered_bases, :mean_coverage, :in_bridges
+ attr_accessor :coverage, :uncovered_bases, :p_uncovered_bases
+ attr_accessor :p_seq_true, :p_unique
+ attr_accessor :low_uniqueness_bases, :in_bridges
+ attr_accessor :p_good, :p_not_segmented
  # reference-based metrics
- attr_accessor :has_crb, :is_chimera, :collapse_factor, :reference_coverage
+ attr_accessor :has_crb, :reference_coverage
  attr_accessor :hits
 
  def initialize(seq, name: nil)
@@ -22,11 +25,16 @@ module Transrate
  @name = seq.respond_to?(:entry_id) ? seq.entry_id : name
  @hits = []
  @reference_coverage = 0
- @collapse_factor = 0
- @is_chimera = false
  @has_crb = false
  @in_bridges = 0
- @mean_coverage = 0
+ @p_seq_true = 0
+ @low_uniqueness_bases = 0
+ @p_good = -1
+ @uncovered_bases = length
+ @p_uncovered_bases = 1
+ @p_unique = 0
+ @p_not_segmented = 1
+ @score = -1
  end
 
  def each &block
@@ -43,34 +51,38 @@ module Transrate
  :cpg_count => cpg_count,
  :cpg_ratio => cpg_ratio,
  :orf_length => orf_length,
- :linguistic_complexity_6 => linguistic_complexity(6)
+ :linguistic_complexity_6 => linguistic_complexity(6),
  }
  end
 
  def read_metrics
- read = @coverage ? {
- :uncovered_bases => uncovered_bases,
- :mean_coverage => mean_coverage,
- :in_bridges => in_bridges
+ read = @p_good>=0 ? {
+ :in_bridges => in_bridges,
+ :p_good => @p_good,
+ :p_bases_covered => p_bases_covered,
+ :p_seq_true => p_seq_true,
+ :score => score,
+ :p_unique => p_unique,
+ :p_not_segmented => p_not_segmented
  } : {
- :uncovered_bases => "NA",
- :mean_coverage => "NA",
- :in_bridges => in_bridges
+ :in_bridges => "NA",
+ :p_good => "NA",
+ :p_bases_covered => "NA",
+ :p_seq_true => "NA",
+ :score => "NA",
+ :p_unique => p_unique,
+ :p_not_segmented => p_not_segmented
  }
  end
 
  def comparative_metrics
  reference = @has_crb ? {
  :has_crb => has_crb,
- :collapse_factor => collapse_factor,
  :reference_coverage => reference_coverage,
- :is_chimera => is_chimera,
  :hits => hits.map{ |h| h.target }.join(";")
  } : {
  :has_crb => false,
- :collapse_factor => "NA",
  :reference_coverage => "NA",
- :is_chimera => "NA",
  :hits => "NA"
  }
  end
@@ -89,7 +101,7 @@ module Transrate
  composition(@seq.seq)
  alphabet = ['a', 'c', 'g', 't', 'n']
  @base_composition = {}
- @dibase_composition={}
+ @dibase_composition = {}
  bases = []
  dibases = []
  alphabet.each do |c|
@@ -208,6 +220,33 @@ module Transrate
  def linguistic_complexity k
  return kmer_count(k, @seq.seq)/(4**k).to_f # call to C
  end
+
+ def p_bases_covered
+ 1 - p_uncovered_bases
+ end
+
+ def uncovered_bases= n
+ @uncovered_bases = n
+ @p_uncovered_bases = n / length.to_f
+ end
+
+ def p_unique_bases
+ (length - low_uniqueness_bases) / length.to_f
+ end
+
+ # Contig score (geometric mean of all score components)
+ def score
+ return @score if @score != -1
+ prod =
+ [p_bases_covered, 0.01].max * # proportion of bases covered
+ [p_not_segmented, 0.01].max * # prob contig has 0 changepoints
+ [p_good, 0.01].max * # proportion of reads that mapped good
+ [p_seq_true, 0.01].max * # scaled 1 - mean per-base edit distance
+ [p_unique, 0.01].max # prop mapQ >= 5
+ s = prod ** (1.0 / 5)
+ s = 0.01 if !s
+ @score = [s, 0.01].max
+ end
  end
 
  end
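The new per-contig score is the geometric mean of five components, each floored at 0.01 so a single zero cannot wipe out the whole score. A worked sketch with made-up component values:

    # hypothetical component values for one contig
    components = [
      0.98,  # p_bases_covered: proportion of bases with read coverage
      0.90,  # p_not_segmented: probability the contig has no coverage changepoints
      0.85,  # p_good: proportion of mapped fragments that map well
      0.95,  # p_seq_true: scaled 1 - mean per-base edit distance
      0.80   # p_unique: proportion of uniquely mapping reads (mapQ >= 5)
    ]
    floored = components.map { |c| [c, 0.01].max }
    score = floored.reduce(:*) ** (1.0 / floored.length)
    puts score.round(3)   # => ~0.894, the geometric mean of the five components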
@@ -0,0 +1,79 @@
+
+ module Transrate
+
+ class ExpressError < StandardError
+ end
+
+ class Express
+
+ require 'ostruct'
+
+ # return an Express object
+ def initialize
+ which = Cmd.new('which express')
+ which.run
+ if !which.status.success?
+ raise ExpressError.new("could not find express in the path")
+ end
+ @express = which.stdout.split("\n").first
+ end
+
+ # return struct containing:
+ # results_file => path to the express results TSV
+ # expression => a hash of target => effective_count
+ # align_samp => path to the sampled alignments file
+ def run assembly, bamfile
+ assembly = assembly.file if assembly.is_a? Assembly
+
+ ex_output = 'results.xprs'
+ fin_output = "#{File.basename assembly}_#{ex_output}"
+
+ unless File.exists? fin_output
+ runner = Cmd.new build_command(assembly, bamfile)
+ runner.run
+ unless runner.status.success?
+ raise ExpressError.new("Express failed\n" +
+ runner.stderr + "\n" +
+ runner.stdout)
+ end
+ File.rename(ex_output, fin_output)
+ end
+
+ OpenStruct.new(:results_file => fin_output,
+ :expression => load_expression(fin_output),
+ :align_samp => 'hits.1.samp.bam')
+ end
+
+ # return the constructed eXpress command
+ def build_command assembly, bamfile
+ cmd = "#{@express}"
+ cmd << " #{File.expand_path assembly}"
+ cmd << " #{File.expand_path bamfile}"
+ cmd << " --output-dir ."
+ cmd << " --output-align-samp"
+ cmd << " --no-update-check"
+ cmd << " --additional-online 1"
+ cmd
+ end
+
+ # return a hash of target => effective_count created
+ # by parsing the results file
+ def load_expression file
+ expression = {}
+ first = true
+ File.open(file).each do |line|
+ if first
+ first = false
+ next
+ end
+ line = line.chomp.split("\t")
+ target = line[1]
+ effective_count = line[7]
+ expression[target] = effective_count.to_f
+ end
+ expression
+ end
+
+ end # Express
+
+ end # Transrate
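A sketch of how the new Express wrapper could be driven on its own; it assumes the eXpress binary is on the PATH, and the assembly and BAM paths below are placeholders:

    # hypothetical standalone use of Transrate::Express
    express = Transrate::Express.new          # raises ExpressError if eXpress is not in the path
    results = express.run('assembly.fa', 'valid.sorted.bam')

    puts results.results_file                 # renamed copy of results.xprs
    puts results.align_samp                   # sampled alignment file written by eXpress
    results.expression.each_pair do |target, effective_count|
      puts "#{target}\t#{effective_count}"
    end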
@@ -2,243 +2,254 @@ module Transrate
 
  class ReadMetrics
 
- require 'bettersam'
- require 'bio-samtools'
-
- attr_reader :total
+ attr_reader :fragments_mapping
+ attr_reader :p_good_mapping
  attr_reader :bad
  attr_reader :supported_bridges
- attr_reader :pr_good_mapping
- attr_reader :percent_mapping
- attr_reader :prop_expressed
  attr_reader :has_run
+ attr_reader :read_length
 
  def initialize assembly
  @assembly = assembly
- @mapper = Bowtie2.new
+ @mapper = Snap.new
  self.initial_values
+
+ load_executables
+ @read_length = 100
+ end
+
+ def load_executables
+ @bam_splitter = get_bin_path 'bam-split'
+ @bam_reader = get_bin_path 'bam-read'
+ end
+
+ def get_bin_path bin
+ which_bin = Cmd.new("which #{bin}")
+ which_bin.run
+ if !which_bin.status.success?
+ raise IOError.new("ReadMetrics: could not find #{bin} in path")
+ end
+ which_bin.stdout.split("\n").first
  end
 
  def run left, right, insertsize:200, insertsd:50, threads:8
+ # check all read files exist
  [left, right].each do |readfile|
- unless File.exist? readfile
- raise IOError.new "ReadMetrics read file does not exist: #{readfile}"
+ raise IOError.new "Read file is nil" if readfile.nil?
+ readfile.split(",").each do |file|
+ unless File.exist? file
+ raise IOError.new "ReadMetrics: read file does not exist: #{file}"
+ end
  end
  end
- @mapper.build_index @assembly.file
- @num_pairs = `wc -l #{left}`.strip.split(/\s+/)[0].to_i/4
- samfile = @mapper.map_reads(@assembly.file, left, right,
+
+ # estimate max read length
+ @read_length = get_read_length(left, right)
+
+ # map reads
+ @mapper.build_index(@assembly.file, threads)
+ bamfile = @mapper.map_reads(@assembly.file, left, right,
  insertsize: insertsize,
  insertsd: insertsd,
  threads: threads)
- # check_bridges
- analyse_read_mappings(samfile, insertsize, insertsd, true)
- analyse_coverage(samfile)
- @pr_good_mapping = @good.to_f / @num_pairs.to_f
- @percent_mapping = @total.to_f / @num_pairs.to_f * 100.0
- @pc_good_mapping = @pr_good_mapping * 100.0
+ @fragments = @mapper.read_count
+
+ # classify bam file into valid and invalid alignments
+ sorted_bam = "#{File.basename(bamfile, '.bam')}.merged.sorted.bam"
+ readsorted_bam = "#{File.basename(bamfile, '.bam')}.valid.sorted.bam"
+ unless File.exist? sorted_bam
+ valid_bam, invalid_bam = split_bam bamfile
+ readsorted_bam = Samtools.readsort_bam(valid_bam, threads)
+ File.delete valid_bam
+ end
+
+ # pass valid alignments to eXpress for assignment
+ # always have to run the eXpress command to load the results
+ assigned_bam = assign_and_quantify readsorted_bam
+
+ # merge the assigned alignments back with the invalid ones
+ unless File.exist? sorted_bam
+ File.delete readsorted_bam
+ merged_bam = "#{File.basename(bamfile, '.bam')}.merged.bam"
+ Samtools.merge_bam(invalid_bam, assigned_bam, merged_bam, threads=threads)
+ File.delete invalid_bam
+ File.delete assigned_bam
+ sorted_bam = Samtools.sort_bam(merged_bam, threads)
+ File.delete merged_bam
+ end
+
+ # analyse the final mappings
+ analyse_read_mappings(sorted_bam, insertsize, insertsd, true)
+
  @has_run = true
  end
 
  def read_stats
  {
- :num_pairs => @num_pairs,
- :total_mappings => @total,
- :percent_mapping => @percent_mapping,
+ :fragments => @fragments,
+ :fragments_mapped => @fragments_mapped,
+ :p_fragments_mapped => @p_fragments_mapped,
  :good_mappings => @good,
- :pc_good_mapping => @pc_good_mapping,
+ :p_good_mapping => @p_good_mapping,
  :bad_mappings => @bad,
- :potential_bridges => @supported_bridges,
- :mean_coverage => @mean_coverage,
- :n_uncovered_bases => @n_uncovered_bases,
- :p_uncovered_bases => @p_uncovered_bases,
- :n_uncovered_base_contigs => @n_uncovered_base_contigs,
- :p_uncovered_base_contigs => @p_uncovered_base_contigs,
- :n_uncovered_contigs => @n_uncovered_contigs,
- :p_uncovered_contigs => @p_uncovered_contigs,
- :n_lowcovered_contigs => @n_lowcovered_contigs,
- :p_lowcovered_contigs => @p_lowcovered_contigs
+ :potential_bridges => @potential_bridges,
+ :bases_uncovered => @bases_uncovered,
+ :p_bases_uncovered => @p_bases_uncovered,
+ :contigs_uncovbase => @contigs_uncovbase,
+ :p_contigs_uncovbase => @p_contigs_uncovbase,
+ :contigs_uncovered => @contigs_uncovered,
+ :p_contigs_uncovered => @p_contigs_uncovered,
+ :contigs_lowcovered => @contigs_lowcovered,
+ :p_contigs_lowcovered => @p_contigs_lowcovered,
+ :contigs_segmented => @contigs_segmented,
+ :p_contigs_segmented => @p_contigs_segmented,
+ :contigs_good => @contigs_good,
+ :p_contigs_good => @p_contigs_good
  }
  end
 
- def analyse_read_mappings samfile, insertsize, insertsd, bridge=true
- @bridges = {} if bridge
- realistic_dist = self.realistic_distance(insertsize, insertsd)
- if File.exists?(samfile) && File.size(samfile) > 0
- ls = BetterSam.new
- rs = BetterSam.new
- sam = File.open(samfile)
- line = sam.readline
- while line and line=~/^@/
- line = sam.readline rescue nil
- end
- while line
- ls.parse_line(line)
- if ls.mate_unmapped?
- self.check_read_single(ls)
- line = sam.readline rescue nil
- else
- line2 = sam.readline rescue nil
- if line2
- rs.parse_line(line2)
- self.check_read_pair(ls, rs, realistic_dist)
- end
- line = sam.readline rescue nil
- end
+ def get_read_length(left, right)
+ count=0
+ file = File.open(left.split(",").first)
+ name = file.readline.chomp
+ seq = file.readline.chomp
+ na = file.readline.chomp
+ qual = file.readline.chomp
+ read_length = 0
+ while name and count < 5000 # get max read length from first 5000 reads
+ read_length = [read_length, seq.length].max
+ name = file.readline.chomp rescue nil
+ seq = file.readline.chomp rescue nil
+ na = file.readline.chomp rescue nil
+ qual = file.readline.chomp rescue nil
+ count+=1
+ end
+ read_length
+ end
+
+ def split_bam bamfile
+ base = File.basename(bamfile, '.bam')
+ valid = "#{base}.valid.bam"
+ invalid = "#{base}.invalid.bam"
+ if !File.exist? valid
+ cmd = "#{@bam_splitter} #{bamfile}"
+ splitter = Cmd.new cmd
+ splitter.run
+ if !splitter.status.success?
+ logger.warn "Couldn't split bam file: #{bamfile}" +
+ "\n#{splitter.stdout}\n#{splitter.stderr}"
  end
- check_bridges
- else
- raise "samfile #{samfile} not found"
  end
+ if !File.exist? valid
+ logger.warn "Splitting failed to create valid bam: #{valid}"
+ end
+ [valid, invalid]
  end
 
- def initial_values
- @num_pairs = 0
- @total = 0
- @good = 0
- @bad = 0
- @both_mapped = 0
- @properly_paired = 0
- @improperly_paired = 0
- @proper_orientation = 0
- @improper_orientation = 0
- @same_contig = 0
- @realistic_overlap = 0
- @unrealistic_overlap = 0
- @realistic_fragment = 0
- @unrealistic_fragment = 0
- @n_uncovered_bases = 0
- @n_uncovered_base_contigs = 0 # any base cov < 1
- @n_uncovered_contigs = 0 # mean cov < 1
- @n_lowcovered_contigs = 0 # mean cov < 10
+ def assign_and_quantify bamfile
+ express = Express.new
+ results = express.run(@assembly, bamfile)
+ analyse_expression results.expression
+ results.align_samp
  end
 
- def realistic_distance insertsize, insertsd
- insertsize + (3 * insertsd)
+ def analyse_expression express_output
+ express_output.each_pair do |name, eff_count|
+ @contigs_uncovered += 1 if eff_count < 1
+ @contigs_lowcovered += 1 if eff_count < 10
+ contig = @assembly[name]
+ contig.coverage = eff_count
+ end
  end
 
- def check_read_single ls
+ def analyse_read_mappings bamfile, insertsize, insertsd, bridge=true
+ if File.exist?(bamfile) && File.size(bamfile) > 0
+ csv_output = "#{File.basename(@assembly.file)}_bam_info.csv"
+ csv_output = File.expand_path(csv_output)
 
- end
+ analyse_bam bamfile, csv_output
+ # open output csv file
+ @potential_bridges = 0
 
- def check_read_pair ls, rs, realistic_dist
- return unless ls.primary_aln?
- @total += 1
- if ls.both_mapped?
- # reads are paired
- @both_mapped += 1 if ls.primary_aln?
- if ls.read_properly_paired?
- # mapped in proper pair
- @properly_paired += 1
- self.check_orientation(ls, rs)
- else
- # not mapped in proper pair
- @improperly_paired += 1
- if ls.chrom == rs.chrom
- # both on same contig
- @same_contig += 1
- self.check_overlap_plausibility(ls, rs)
- else
- self.check_fragment_plausibility(ls, rs, realistic_dist)
- end
+ CSV.foreach(csv_output, :headers => true,
+ :header_converters => :symbol,
+ :converters => :all) do |row|
+ populate_contig_data row
  end
- end
- end
-
- def check_orientation ls, rs
- if ls.pair_opposite_strands?
- # mates in proper orientation
- @proper_orientation += 1
- @good += 1
+ @bad = @fragments_mapped - @good
  else
- # mates in wrong orientation
- @improper_orientation += 1
- @bad += 1
+ logger.warn "couldn't find bamfile: #{bamfile}"
+ end
+ @assembly.assembly.each_pair do |name, contig|
+ @contigs_good += 1 if contig.score >= 0.5
  end
+ update_proportions
  end
 
- def check_overlap_plausibility ls, rs
- if Math.sqrt((ls.pos - rs.pos) ** 2) < ls.seq.length
- # overlap is realistic
- @realistic_overlap += 1
- self.check_orientation(ls, rs)
- else
- # overlap not realistic
- @unrealistic_overlap+= 1
- @bad += 1
- end
+ def update_proportions
+ nbases = @assembly.n_bases.to_f
+ ncontigs = @assembly.size.to_f
+
+ @p_bases_uncovered = @bases_uncovered / nbases
+ @p_contigs_uncovbase = @contigs_uncovbase / ncontigs
+ @p_contigs_uncovered = @contigs_uncovered / ncontigs
+ @p_contigs_lowcovered = @contigs_lowcovered / ncontigs
+ @p_contigs_segmented = @contigs_segmented / ncontigs
+ @p_contigs_good = @contigs_good / ncontigs
+
+ @p_good_mapping = @good.to_f / @fragments.to_f
+ @p_fragments_mapped = @fragments_mapped / @fragments.to_f
  end
 
- def check_fragment_plausibility ls, rs, realistic_dist
- # mates on different contigs
- # are the mapping positions within a realistic distance of
- # the ends of contigs?
- ldist = [ls.pos, ls.seq.length - ls.pos].min
- rdist = [rs.pos, rs.seq.length - rs.pos].min
- if ldist + rdist <= realistic_dist
- # increase the evidence for this bridge
- key = [ls.chrom, rs.chrom].sort.join("<>").to_sym
- if @bridges.has_key? key
- @bridges[key] += 1
- else
- @bridges[key] = 1
+ def analyse_bam bamfile, csv_output
+ if !File.exist?(csv_output)
+ cmd = "#{@bam_reader} #{bamfile} #{csv_output}"
+ reader = Cmd.new cmd
+ reader.run
+ if !reader.status.success?
+ logger.warn "couldn't get information from bam file: #{bamfile}"
  end
- @realistic_fragment += 1
- @good += 1
- else
- @unrealistic_fragment += 1
- @bad += 1
  end
  end
 
- def check_bridges
- @supported_bridges = 0
- CSV.open('supported_bridges.csv', 'w') do |f|
- @bridges.each_pair do |b, count|
- start, finish = b.to_s.split('<>')
- @assembly[start].in_bridges += 1
- @assembly[finish].in_bridges += 1
- if count > 1
- f << [start, finish, count]
- @supported_bridges += 1
- end
- end
+ def populate_contig_data row
+ contig = @assembly[row[:name]]
+ scale = 0.7
+ contig.p_seq_true = (row[:p_seq_true] - scale) * (1.0 / (1 - scale))
+ contig.uncovered_bases = row[:bases_uncovered]
+ @bases_uncovered += contig.uncovered_bases
+ if row[:fragments_mapped] and row[:fragments_mapped] > 0
+ contig.p_good = row[:good]/row[:fragments_mapped].to_f
+ end
+ contig.p_not_segmented = row[:p_not_segmented]
+ if contig.p_not_segmented < 0.5
+ @contigs_segmented += 1
+ end
+ contig.in_bridges = row[:bridges]
+ contig.p_unique = row[:p_unique]
+ if row[:bridges] > 1
+ @potential_bridges += 1
+ end
+ @fragments_mapped += row[:fragments_mapped]
+ @good += row[:good]
+ if row[:bases_uncovered] > 0
+ @contigs_uncovbase += 1
  end
  end
 
- # Generate per-base and contig read coverage statistics.
- # Note that contigs less than 200 bases long are ignored in this
- # analysis.
- def analyse_coverage samfile
- bamfile, sorted, index = Samtools.sam_to_sorted_indexed_bam samfile
- bam = Bio::DB::Sam.new(:bam => sorted, :fasta => @assembly.file)
- # get per-base coverage and calculate mean,
- # identify zero-coverage bases
- n, tot_length, tot_coverage = 0, 0, 0
- @assembly.each_with_coverage(bam) do |contig, coverage|
- next if contig.length < 200
- contig.uncovered_bases, total = 0, 0
- coverage.each do |e|
- total += e
- contig.uncovered_bases += 1 if e < 1
- end
- tot_length += coverage.length
- tot_coverage += total
- contig.mean_coverage = total / coverage.length.to_f
- @n_uncovered_bases += contig.uncovered_bases
- @n_uncovered_base_contigs += 1 if contig.uncovered_bases > 0
- @n_uncovered_contigs += 1 if contig.mean_coverage < 1
- @n_lowcovered_contigs += 1 if contig.mean_coverage < 10
- end
- @mean_coverage = (tot_coverage / tot_length.to_f).round(2)
- @p_uncovered_bases = @n_uncovered_bases / @assembly.n_bases.to_f
- @p_uncovered_base_contigs = @n_uncovered_base_contigs /
- @assembly.size.to_f
- @p_uncovered_contigs = @n_uncovered_contigs / @assembly.size.to_f
- @p_lowcovered_contigs = @n_lowcovered_contigs / @assembly.size.to_f
+ def initial_values
+ @fragments = 0
+ @fragments_mapped = 0
+ @good = 0
+ @bad = 0
+ @bases_uncovered = 0
+ @contigs_uncovbase = 0 # any base cov < 1
+ @contigs_uncovered = 0 # mean cov < 1
+ @contigs_lowcovered = 0 # mean cov < 10
+ @contigs_segmented = 0 # p_not_segmented < 0.5
+ @contigs_good = 0
  end
 
  end # ReadMetrics
 
  end # Transrate
-
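ReadMetrics now accumulates per-contig values from the bam-read CSV in populate_contig_data and converts the totals into assembly-level proportions in update_proportions. A small sketch of that roll-up in isolation, with made-up counts:

    # hypothetical assembly-level counters after populate_contig_data has run
    fragments        = 1_000_000   # total fragments reported by the mapper
    fragments_mapped = 850_000
    good             = 700_000     # fragments whose alignments pass the 'good' criteria
    ncontigs         = 20_000
    contigs_good     = 14_500      # contigs with score >= 0.5

    p_fragments_mapped = fragments_mapped / fragments.to_f   # => 0.85
    p_good_mapping     = good / fragments.to_f               # => 0.7
    p_contigs_good     = contigs_good / ncontigs.to_f        # => 0.725
    puts [p_fragments_mapped, p_good_mapping, p_contigs_good].inspect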