RubyGems - bio-maf - Versions diffs - 0.1.0-java → 0.2.0-java - Mend

bio-maf 0.1.0-java → 0.2.0-java

Files changed (26) hide show

data/.gitignore +53 -0
data/DEVELOPMENT.md +29 -0
data/Gemfile +1 -0
data/README.md +69 -1
data/Rakefile +4 -3
data/bin/find_overlaps +21 -0
data/bin/maf_tile +103 -0
data/bio-maf.gemspec +43 -0
data/features/gap-filling.feature +158 -0
data/features/gap-removal.feature +50 -0
data/features/step_definitions/gap-filling_steps.rb +32 -0
data/features/step_definitions/gap_removal_steps.rb +19 -0
data/features/step_definitions/parse_steps.rb +2 -1
data/lib/bio/maf.rb +2 -0
data/lib/bio/maf/index.rb +15 -8
data/lib/bio/maf/maf.rb +267 -0
data/lib/bio/maf/parser.rb +115 -175
data/lib/bio/maf/tiler.rb +167 -0
data/man/maf_tile.1 +108 -0
data/man/maf_tile.1.ronn +104 -0
data/spec/bio/maf/index_spec.rb +1 -0
data/spec/bio/maf/parser_spec.rb +103 -0
data/spec/bio/maf/tiler_spec.rb +69 -0
data/test/data/gap-sp1.fa +6 -0
data/test/data/mm8_chr7_tiny.kct +0 -0
metadata +65 -7

@@ -0,0 +1,50 @@
+Feature: Remove gaps from MAF files
+  In order to work with only the alignment data involving sequences
+  Which can be used by downstream software
+  We may want to filter out certain species
+  Which can leave gap regions where sequence data was only present
+  For removed species
+  So it is useful to be able to remove those gaps
+  Background:
+    Given MAF data:
+    """
+    ##maf version=1
+    a score=10542.0
+    s mm8.chr7                 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
+    s rn4.chr1                136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
+    s oryCun1.scaffold_199771     14021 43 -     75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
+    s hg18.chr15               88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
+    s panTro2.chr15            87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
+    s rheMac2.chr7             69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
+    s canFam2.chr3             56030570 39 +  94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
+    s dasNov1.scaffold_106893      7435 34 +      9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
+    s loxAfr1.scaffold_8298       30264 38 +     78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
+    s echTel1.scaffold_304651       594 37 -     10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
+    """
+    When I open it with a MAF reader
+    And filter for only the species
+    | mm8     |
+    | rn4     |
+    | hg18    |
+    | canFam2 |
+    | loxAfr1 |
+  Scenario: Detect filtered blocks
+    When an alignment block can be obtained
+    Then the alignment block is marked as filtered
+    And the alignment block has 5 sequences
+  Scenario: Detect gaps
+    When an alignment block can be obtained
+    Then 1 gap is found with length [14]
+  Scenario: Remove gaps
+    When an alignment block can be obtained
+    And gaps are removed
+    Then the text size of the block is 40
+  Scenario: Remove gaps in the parser
+    When I enable the :remove_gaps parser option
+    And an alignment block can be obtained
+    Then the text size of the block is 40

data/features/step_definitions/gap-filling_steps.rb ADDED

@@ -0,0 +1,32 @@
+Given /^chromosome reference sequence:$/ do |string| # wraps the step's text argument in a StringIO
+  sio = StringIO.new(string)
+  @refseq = Bio::MAF::FASTARangeReader.new(sio) # kept for the later "with the chromosome reference" tiling step
+end
+When /^tile ([^:\s]+):(\d+)-(\d+)( with the chromosome reference)?$/ do |seq, i_start, i_end, ref_p| # ref_p is nil unless the optional suffix was matched
+  @tiler = Bio::MAF::Tiler.new
+  @tiler.index = @idx # @idx and @parser are presumably set by earlier steps defined elsewhere; verify
+  @tiler.parser = @parser
+  @tiler.reference = @refseq if ref_p
+  @tiler.interval = Bio::GenomicInterval.zero_based(seq,
+                                                    i_start.to_i,
+                                                    i_end.to_i)
+end
+When /^tile with species \[(.+?)\]$/ do |species_text| # species_text is the comma-separated list between the brackets
+  @tiler.species = species_text.split(/,\s*/)
+end
+When /^map species (\S+) as (\S+)$/ do |sp1, sp2| # records sp2 under key sp1 in the tiler's species_map
+  @tiler.species_map[sp1] = sp2
+end
+When /^write the tiled data as FASTA$/ do
+  @dst = Tempfile.new(["cuke", ".fa"]) # temp file is read back by the "FASTA data obtained" step
+  @tiler.write_fasta(@dst)
+end
+Then /^the FASTA data obtained should be:$/ do |string|
+  @dst.seek(0) # rewind before reading what was written
+  @dst.read.rstrip.should == string.rstrip # trailing whitespace is ignored on both sides
+end

data/features/step_definitions/gap_removal_steps.rb ADDED

@@ -0,0 +1,19 @@
+Then /^the alignment block is marked as filtered$/ do
+  @block.filtered?.should be_true
+end
+Then /^(\d+) gaps? (?:is|are) found with length \[(\d+)\]$/ do |n_gaps, gap_sizes_s| # NOTE(review): \[(\d+)\] admits a single integer only, although the body below splits a comma-separated list - confirm intent
+  gaps = @block.find_gaps
+  gaps.size.should == n_gaps.to_i
+  e_gap_sizes = gap_sizes_s.split(/,\s*/).collect { |n| n.to_i } # expected sizes, as integers
+  gap_sizes = gaps.collect { |gap| gap[1] } # second element of each entry returned by find_gaps
+  gap_sizes.should == e_gap_sizes
+end
+When /^gaps are removed$/ do
+  @block.remove_gaps!
+end
+Then /^the text size of the block is (\d+)$/ do |e_text_size| # e_text_size arrives as a String and is converted below
+  @block.text_size.should == e_text_size.to_i
+end

data/features/step_definitions/parse_steps.rb CHANGED

@@ -1,5 +1,6 @@
 When /^I open it with a MAF reader$/ do
-  @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
+  @opts ||= {}
+  @parser = Bio::MAF::Parser.new(@src_f, @opts)
 end
 When /^I enable the :(\S+) parser option$/ do |opt_s|

data/lib/bio/maf.rb CHANGED

@@ -1,4 +1,6 @@
+require 'bio/maf/maf'
 require 'bio/maf/struct'
 require 'bio/maf/index'
 require 'bio/maf/parser'
 require 'bio/maf/writer'
+require 'bio/maf/tiler'

data/lib/bio/maf/index.rb CHANGED

@@ -65,10 +65,11 @@ module Bio
       include KVHelpers
       attr_reader :db, :species, :species_max_id
-      attr_accessor :index_sequences
+      attr_accessor :index_sequences, :ref_seq
       FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
       FORMAT_VERSION = 2
+      REF_SEQ_KEY = 'bio-maf:reference-sequence'
       MAX_SPECIES = 64
       ## Key-value store index format
@@ -221,6 +222,7 @@ module Bio
           raise "Could not open DB file!"
         end
         if mode == KyotoCabinet::DB::OREADER
+          self.ref_seq = db[REF_SEQ_KEY]
           load_index_sequences
           load_species
         end
@@ -309,11 +311,12 @@ module Bio
         end
         ready = Time.now
         $stderr.puts "bin intervals computed after #{ready - start} seconds."
-        if RUBY_PLATFORM == 'java'
-          scan_bins_parallel(chrom_id, bin_intervals, filters)
-        else
-          scan_bins(chrom_id, bin_intervals, filters)
-        end
+        matches = if RUBY_PLATFORM == 'java'
+                    scan_bins_parallel(chrom_id, bin_intervals, filters)
+                  else
+                    scan_bins(chrom_id, bin_intervals, filters)
+                  end
+        matches.sort_by! { |e| e[0] } # sort by offset in file
       end # #fetch_list
       # Scan the index for blocks matching the given bins and intervals.
@@ -344,7 +347,7 @@ module Bio
       def scan_bins_parallel(chrom_id, bin_intervals, filters)
         start = Time.now
-        n_threads = ENV['profile'] ? 1 : 4
+        n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
         jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
         completed = java.util.concurrent.LinkedBlockingQueue.new(128)
         threads = []
@@ -445,7 +448,8 @@ module Bio
       def build_default(parser)
         first_block = parser.parse_block
-        ref_seq = first_block.sequences.first.source
+        self.ref_seq = first_block.sequences.first.source
+        db[REF_SEQ_KEY] = ref_seq
         db[FORMAT_VERSION_KEY] = FORMAT_VERSION
         @index_sequences = { ref_seq => 0 }
         store_index_sequences!
@@ -521,6 +525,9 @@ module Bio
       end
       def entries_for(block)
+        unless block.ref_seq.source == @ref_seq
+          raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
+        end
         h = {}
         val = build_block_value(block)
         block.sequences.each do |seq|

data/lib/bio/maf/maf.rb ADDED

@@ -0,0 +1,267 @@
+module Bio
+  module MAF
+    # A MAF header, containing the variable-value pairs from the first
+    # line of the file as well as the alignment parameters.
+    # @api public
+    class Header
+      # Variable-value pairs from the ##maf line
+      # @return [Hash]
+      attr_accessor :vars
+      # Alignment parameters from the MAF header.
+      # @return [Hash]
+      attr_accessor :alignment_params
+      def initialize(vars, params) # params is exposed afterwards as #alignment_params
+        @vars = vars
+        @alignment_params = params
+      end
+      # The required version parameter, looked up as vars[:version].
+      # @return [String]
+      def version
+        vars[:version]
+      end
+      # The optional scoring parameter, looked up as vars[:scoring].
+      # @return [String] presumably nil when absent, given vars is a Hash as documented above
+      def scoring
+        vars[:scoring]
+      end
+    end
+    # A MAF alignment block.
+    # @api public
+    class Block
+      # Parameters from the 'a' line starting the alignment block.
+      attr_reader :vars
+      # Sequences, one per 's' or 'e' line.
+      # @return [Array<Sequence>]
+      attr_reader :sequences
+      # Offset of the alignment block within the MAF file, in bytes.
+      # @return [Integer]
+      attr_reader :offset
+      # Size of the alignment block within the MAF file, in bytes.
+      # @return [Integer]
+      attr_reader :size
+      def initialize(vars, sequences, offset, size, filtered) # filtered is the value later reported by #filtered?
+        @vars = vars
+        @sequences = sequences
+        @offset = offset
+        @size = size
+        @filtered = filtered
+      end
+      def ref_seq # the first sequence of the block
+        sequences[0]
+      end
+      def raw_seq(i) # uses fetch, so (for an Array) an out-of-range i raises IndexError instead of returning nil
+        sequences.fetch(i)
+      end
+      def each_raw_seq # yields every sequence, in order, to the given block
+        sequences.each { |s| yield s }
+      end
+      # Text size of the alignment block. This is the number of text
+      # characters in each line of sequence data, including dashes and
+      # other gaps in the sequence. It is measured on the first sequence only.
+      def text_size
+        sequences.first.text.size
+      end
+      # Whether this block has been modified by a parser filter.
+      # @return [Boolean]
+      def filtered?
+        @filtered
+      end
+      GAP = /-+/ # one or more consecutive dashes
+      # Find gaps present in all sequences: dash runs that begin at the same text offset in the first sequence and in every other non-empty sequence.
+      # Returns an array of [offset, size] pairs, size being the shortest of the matching dash runs. These would generally occur when some sequences have been filtered out.
+      # @see #remove_gaps!
+      # @see Parser#sequence_filter
+      def find_gaps
+        ref_s = StringScanner.new(sequences.first.text)
+        others = sequences.slice(1, sequences.size - 1).reject { |s| s.empty? }.collect { |s| StringScanner.new(s.text) }
+        gaps = []
+        while ref_s.scan_until(GAP) # advance to the end of the next dash run in the first sequence
+          offset = ref_s.pos - ref_s.matched_size # text offset where that dash run begins
+          others.each { |s| s.pos = offset }
+          unless others.find { |s| ! s.scan(GAP) } # scan is anchored at pos: each other sequence must have a dash at this offset
+            # all matched
+            gap_size = [ref_s.matched_size,
+                        others.map {|s| s.matched_size}.min].min # NOTE(review): inner min is nil when others is empty (no other non-empty sequence), which would make the outer min raise - confirm
+            gaps << [offset, gap_size]
+          end
+        end
+        gaps
+      end
+      # Remove gaps present in all sequences. These would generally
+      # occur when some sequences have been filtered out. Returns the number of gaps found by #find_gaps.
+      # @see #find_gaps
+      # @see Parser#sequence_filter
+      def remove_gaps!
+        gaps = find_gaps()
+        gaps.reverse_each do |offset, len| # delete from the last gap backwards so earlier offsets stay valid
+          sequences.each do |seq|
+            seq.delete_text(offset, len)
+          end
+        end
+        gaps.size
+      end
+    end
+    # A sequence within an alignment block.
+    # @api public
+    class Sequence
+      # @return [String] Source sequence name.
+      attr_reader :source
+      # @return [Integer] Zero-based start position.
+      attr_reader :start
+      # @return [Integer] Size of aligning region in source sequence.
+      attr_reader :size
+      # :+ or :-, indicating which strand the alignment is to.
+      # @return [Symbol]
+      attr_reader :strand
+      # Size of the entire source sequence, not just the aligning
+      # region.
+      # @return [Integer]
+      attr_reader :src_size
+      # Sequence data for the alignment, including insertions.
+      # @return [String]
+      attr_reader :text
+      # Array of raw synteny information from 'i' line.
+      # @return [Array<String>]
+      attr_accessor :i_data
+      # Quality string from 'q' line.
+      # @return [String]
+      attr_accessor :quality
+      alias_method :source_size, :src_size
+      def initialize(*args) # positional: source, start, size, strand, src_size, text
+        @source, @start, @size, @strand, @src_size, @text = args
+      end
+      def end # exclusive end position: start + size
+        start + size
+      end
+      # Whether this sequence is empty. Only true for {EmptySequence}
+      # instances from 'e' lines.
+      def empty?
+        false
+      end
+      def gapped? # true when the text length differs from size
+        size != text.size
+      end
+      def species # portion of source before the first '.', or nil when source contains no '.'
+        parts = source.split('.', 2)
+        parts.size == 2 ? parts[0] : nil
+      end
+      def delete_text(offset, len) # removes len characters at offset from text (and quality, if set) in place; no-op when empty?
+        unless empty?
+          text.slice!(offset, len)
+          if quality
+            quality.slice!(offset, len)
+          end
+        end
+      end
+      def write_fasta(writer) # calls writer.write with a "source:start-end" header and the alignment text
+        writer.write("#{source}:#{start}-#{start + size}",
+                     text)
+      end
+      # Maps the given zero-based genomic range onto a range of string
+      # offsets, suitable for extracting the text for the given range
+      # from #text. The result is always an end-exclusive Range.
+      # Raises RuntimeError when the range lies outside start...self.end, or when no mapping is found in gapped text.
+      # @see String#slice
+      def text_range(range)
+        r_end = range.exclude_end? ? range.end : range.end + 1 # normalise to an exclusive end position
+        r_size = r_end - range.begin
+        if range.begin == start && r_size == size
+          # special case, entire text
+          0...text.size
+        else
+          if range.begin < start || r_end > self.end
+            raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
+          end
+          if ! gapped?
+            # no gaps, can map indexes directly
+            (range.begin - start)...(r_end - start)
+          else
+            # gaps present
+            g_start = start     # genomic position of the start
+            t_start = 0         # text position of the start
+            m_begin = nil       # beginning of match
+            match = nil
+            text.scan(/(\w+|-+)/) do |parts| # walk alternating runs of sequence characters and dashes
+              part = parts[0]
+              if part[0] != '-'
+                # sequence text
+                g_end = g_start + part.size
+                if g_start <= range.begin && range.begin < g_end
+                  offset_in_part = range.begin - g_start
+                  m_begin = offset_in_part + t_start
+                end
+                if g_start <= r_end && r_end <= g_end
+                  raise "reached end before start!" unless m_begin
+                  offset_in_part = r_end - g_start
+                  m_end = offset_in_part + t_start
+                  match = m_begin...m_end
+                  break
+                end
+                g_start = g_end
+              else
+                # gap: advances the text position below but not the genomic position
+              end
+              t_start += part.size
+            end
+            raise "no match found!" unless match
+            return match
+          end
+        end
+      end
+    end
+    # An empty sequence record from an 'e' line.
+    #
+    # This indicates that "there isn't aligning DNA for a species but
+    # that the current block is bridged by a chain that connects
+    # blocks before and after this block" (MAF spec).
+    # @api public
+    class EmptySequence < Sequence
+      attr_reader :status
+      def initialize(*args) # first five args go to Sequence (source, start, size, strand, src_size); the sixth is the status
+        super(*args[0..4])
+        @status = args[5]
+      end
+      def text # always the empty string; no text is passed to Sequence#initialize
+        ''
+      end
+      def empty?
+        true
+      end
+      def write_fasta(writer) # always raises: FASTA output is not implemented for empty sequences
+        raise "empty sequence output not implemented!"
+      end
+    end
+  end
+end