RubyGems - bio-maf - Versions diffs - 0.2.0 → 0.3.0 - Mend

bio-maf 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

data/.gitignore +1 -0
data/Gemfile +2 -1
data/README.md +98 -29
data/Rakefile +6 -2
data/bin/maf_tile +59 -35
data/bio-maf.gemspec +4 -3
data/features/block-joining.feature +32 -0
data/features/dir-access.feature +46 -0
data/features/maf-indexing.feature +23 -0
data/features/maf-to-fasta.feature +9 -0
data/features/slice.feature +54 -0
data/features/step_definitions/dir-access_steps.rb +15 -0
data/features/step_definitions/file_steps.rb +7 -0
data/features/step_definitions/gap_removal_steps.rb +4 -0
data/features/step_definitions/index_steps.rb +3 -3
data/features/step_definitions/output_steps.rb +9 -1
data/features/step_definitions/parse_steps.rb +13 -2
data/features/step_definitions/query_steps.rb +7 -6
data/features/step_definitions/slice_steps.rb +15 -0
data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
data/features/support/aruba.rb +1 -0
data/features/support/env.rb +3 -1
data/features/{gap-filling.feature → tiling.feature} +85 -0
data/lib/bio/maf/index.rb +223 -11
data/lib/bio/maf/maf.rb +209 -0
data/lib/bio/maf/parser.rb +190 -111
data/lib/bio/maf/tiler.rb +33 -6
data/man/maf_index.1 +1 -1
data/man/maf_tile.1 +7 -7
data/man/maf_tile.1.ronn +21 -13
data/man/maf_to_fasta.1 +1 -1
data/spec/bio/maf/index_spec.rb +99 -0
data/spec/bio/maf/maf_spec.rb +184 -0
data/spec/bio/maf/parser_spec.rb +75 -115
data/spec/bio/maf/tiler_spec.rb +44 -0
data/test/data/chr22_ieq2.maf +11 -0
data/test/data/gap-1.kct +0 -0
data/test/data/gap-1.maf +9 -0
data/test/data/gap-filled1.fa +6 -0
data/test/data/gap-sp1.fa.gz +0 -0
data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
data/test/data/mm8_chrM_tiny.kct +0 -0
data/test/data/mm8_chrM_tiny.maf +1000 -0
metadata +59 -7

data/.gitignore CHANGED Viewed

@@ -51,3 +51,4 @@ pkg
 # Ignore Gemfile.lock for gems. See http://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/
 Gemfile.lock
+tmp

data/Gemfile CHANGED Viewed

@@ -13,7 +13,6 @@ group :development do
   gem "redcarpet", "~> 2.1.1", :platforms => :mri
   gem "ronn", "~> 0.7.3", :platforms => :mri
   gem "sinatra", "~> 1.3.2" # for ronn --server
-  gem "rubygems-tasks", "~> 0.2.3"
 end
 group :test do
@@ -21,4 +20,6 @@ group :test do
   gem "rake", ">= 0.9"
   gem "cucumber", ">= 0"
   gem "rspec", "~> 2.10.0"
+  gem "rubygems-tasks", "~> 0.2.3"
+  gem "aruba", "~> 0.4.11"
 end

data/README.md CHANGED Viewed

@@ -92,6 +92,19 @@ Or programmatically:
 Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
+    require 'bio-maf'
+    access = Bio::MAF::Access.maf_dir('test/data')
+    q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
+    access.find(q) do |block|
+      ref_seq = block.sequences[0]
+      puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
+    end
+    # => Matched block at 80082592, 121 bases
+    # => Matched block at 80082713, 54 bases
+Or, equivalently, one can work with a specific MAF file and index directly:
     require 'bio-maf'
     parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
@@ -106,15 +119,27 @@ Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/te
     # => Matched block at 80082592, 121 bases
     # => Matched block at 80082713, 54 bases
+### Extract alignment blocks truncated to a given interval
+Given a genomic interval of interest, one can also extract only the
+subsets of blocks that intersect with that interval, using the
+`#slice` method like so:
+    require 'bio-maf'
+    access = Bio::MAF::Access.maf_dir('test/data')
+    int = Bio::GenomicInterval.zero_based('mm8.chr7', 80082350, 80082380)
+    blocks = access.slice(int).to_a
+    puts "Got #{blocks.size} blocks, first #{blocks.first.ref_seq.size} base pairs."
+    # => Got 2 blocks, first 18 base pairs.
 ### Filter species returned in alignment blocks
     require 'bio-maf'
-    parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
-    idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
+    access = Bio::MAF::Access.maf_dir('test/data')
-    parser.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
+    access.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
     q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
-    blocks = idx.find(q, parser)
+    blocks = access.find(q)
     block = blocks.first
     puts "Block has #{block.sequences.size} sequences."
@@ -129,23 +154,26 @@ See also the [Cucumber feature][] and [step definitions][] for this.
 #### Match only blocks with all specified species
+    access = Bio::MAF::Access.maf_dir('test/data')
     q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
-    filter = { :with_all_species => %w(panTro2 loxAfr1) }
-    n_blocks = idx.find(q, parser, filter).count
+    access.block_filter = { :with_all_species => %w(panTro2 loxAfr1) }
+    n_blocks = access.find(q).count
     # => 1
 #### Match only blocks with a certain number of sequences
+    access = Bio::MAF::Access.maf_dir('test/data')
     q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
-    filter = { :at_least_n_sequences => 6 }
-    n_blocks = idx.find(q, parser, filter).count
+    access.block_filter = { :at_least_n_sequences => 6 }
+    n_blocks = access.find(q).count
     # => 1
 #### Match only blocks within a text size range
+    access = Bio::MAF::Access.maf_dir('test/data')
     q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
-    filter = { :min_size => 72, :max_size => 160 }
-    n_blocks = idx.find(q, parser, filter).count
+    access.block_filter = { :min_size => 72, :max_size => 160 }
+    n_blocks = access.find(q).count
     # => 3
 ### Process each block in a MAF file
@@ -155,7 +183,7 @@ See also the [Cucumber feature][] and [step definitions][] for this.
     puts "MAF version: #{p.header.version}"
     # => MAF version: 1
-    p.parse_blocks.each do |block|
+    p.each_block do |block|
       block.sequences.each do |seq|
         do_something(seq)
       end
@@ -183,6 +211,12 @@ Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/d
     #      @size=1601, @strand=:+, @src_size=50103, @text=nil,
     #      @status="I">
+Such options can also be set on a Bio::MAF::Access object:
+    require 'bio-maf'
+    access = Bio::MAF::Access.maf_dir('test/data')
+    access.parse_options[:parse_empty] = true
 ### Remove gaps from parsed blocks
 After filtering out species with
@@ -192,8 +226,42 @@ sequences that were filtered out. Such gaps can be removed by setting
 the `:remove_gaps` parser option:
     require 'bio-maf'
-    p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
-                             :remove_gaps => true)
+    access = Bio::MAF::Access.maf_dir('test/data')
+    access.parse_options[:remove_gaps] = true
+### Join blocks after filtering together
+Similarly, filtering out species may remove a species which had caused
+two adjacent alignment blocks to be split. By enabling the
+`:join_blocks` parser option, such blocks can be joined together:
+    require 'bio-maf'
+    access = Bio::MAF::Access.maf_dir('test/data')
+    access.parse_options[:join_blocks] = true
+See the [Cucumber feature][] for more details.
+[Cucumber feature]: https://github.com/csw/bioruby-maf/blob/master/features/block-joining.feature
+### Extract bio-alignment representations of blocks
+When the `:as_bio_alignment` parser option is given, blocks will be
+returned as [Bio::BioAlignment::Alignment][] objects as used in the
+[bio-alignment] Biogem. This offers a great deal of built-in
+functionality for column-wise operations, alignment manipulation, and
+more.
+[Bio::BioAlignment::Alignment]: http://rdoc.info/gems/bio-alignment/Bio/BioAlignment/Alignment
+[bio-alignment]: https://github.com/pjotrp/bioruby-alignment
+    require 'bio-maf'
+    access = Bio::MAF::Access.maf_dir('test/data')
+    access.parse_options[:as_bio_alignment] = true
+    q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
+    access.find(q) do |aln|
+      col = aln.columns[3]
+      puts "bases in column 3: #{col}"
+    end
 ### Tile blocks together over an interval
@@ -206,24 +274,25 @@ the
 [`maf_tile(1)`](http://csw.github.com/bioruby-maf/man/maf_tile.1.html)
 man page.
-[feature]: https://github.com/csw/bioruby-maf/blob/master/features/gap-filling.feature
+[feature]: https://github.com/csw/bioruby-maf/blob/master/features/tiling.feature
     require 'bio-maf'
-    tiler = Bio::MAF::Tiler.new
-    tiler.index = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
-    tiler.parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
-    # optional
-    tiler.reference = Bio::MAF::FASTARangeReader.new('reference.fa.gz')
-    tiler.species = %w(mm8 rn4 hg18)
-    tiler.species_map = {
-      'mm8' => 'mouse',
-      'rn4' => 'rat',
-      'hg18' => 'human'
-    }
-    tiler.interval = Bio::GenomicInterval.zero_based('mm8.chr7',
-                                                     80082334,
-                                                     80082468)
-    tiler.write_fasta($stdout)
+    access = Bio::MAF::Access.maf_dir('test/data')
+    interval = Bio::GenomicInterval.zero_based('mm8.chr7',
+                                               80082334,
+                                               80082468)
+    access.tile(interval) do |tiler|
+      # reference is optional
+      tiler.reference = 'reference.fa.gz'
+      tiler.species = %w(mm8 rn4 hg18)
+      # species_map is optional
+      tiler.species_map = {
+        'mm8' => 'mouse',
+        'rn4' => 'rat',
+        'hg18' => 'human'
+      }
+      tiler.write_fasta($stdout)
+    end
 ### Command line tools

data/Rakefile CHANGED Viewed

@@ -23,7 +23,10 @@ RSpec::Core::RakeTask.new(:spec) do |spec|
 end
 require 'cucumber/rake/task'
-Cucumber::Rake::Task.new do |features|
+Cucumber::Rake::Task.new do |t|
+  opts = "features"
+  opts << ' --tags ~@no_jruby' if RUBY_PLATFORM == 'java'
+  t.cucumber_opts = opts
 end
 task :test => [ :spec, :cucumber ]
@@ -44,7 +47,8 @@ if ronn_avail
   desc "Generate man pages"
   task :man do
     file_spec = RONN_FILES.join(' ')
-    sh "ronn --roff --html --style toc --date #{$gemspec.date.strftime('%Y-%m-%d')} --manual='BioRuby Manual' --organization='#{$gemspec.author}' #{file_spec}"
+    #sh "ronn --roff --html --style toc --date #{$gemspec.date.strftime('%Y-%m-%d')} --manual='BioRuby Manual' --organization='#{$gemspec.author}' #{file_spec}"
+    sh "ronn --roff --html --style toc --date #{Time.now.strftime('%Y-%m-%d')} --manual='BioRuby Manual' --organization='BioRuby' #{file_spec}"
   end
   namespace :man do

data/bin/maf_tile CHANGED Viewed

@@ -6,6 +6,24 @@ require 'ostruct'
 require 'bio-maf'
 require 'bio-genomic-interval'
+def parse_interval(line)
+  src, r_start_s, r_end_s, _ = line.split(nil, 4)
+  r_start = r_start_s.to_i
+  r_end = r_end_s.to_i
+  return Bio::GenomicInterval.zero_based(src, r_start, r_end)
+end
+def target_for(base, interval, &blk)
+  path = "#{base}_#{interval.zero_start}-#{interval.zero_end}.fa"
+  File.open(path, 'w', &blk)
+end
+def apply_options(options, tiler)
+  tiler.reference = options.ref if options.ref
+  tiler.species = options.species
+  tiler.species_map = options.species_map
+end
 options = OpenStruct.new
 options.p = { :threads => 1 }
 options.species = []
@@ -13,16 +31,20 @@ options.species_map = {}
 options.usage = false
 o_parser = OptionParser.new do |opts|
-  opts.banner = "Usage: maf_tile [options] <maf> <index>"
+  opts.banner = "Usage: maf_tile [options] <maf> [index]"
   opts.separator ""
   opts.separator "Options:"
   opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref|
     options.ref = ref
   end
-  opts.on("-i", "--interval BEGIN:END", "Genomic interval, zero-based") do |int|
-    if int =~ /(\d+):(\d+)/
+  opts.on("-i", "--interval [CHR:]BEGIN:END", "Genomic interval, zero-based") do |int|
+    if int =~ /(.+):(\d+):(\d+)/
+      gi = Bio::GenomicInterval.zero_based($1, ($2.to_i), ($3.to_i))
+      options.genomic_interval = gi
+    elsif int =~ /(\d+):(\d+)/
       options.interval = ($1.to_i)...($2.to_i)
     else
+      $stderr.puts "Invalid interval specification #{int}!"
       options.usage = true
     end
   end
@@ -51,30 +73,19 @@ maf_p = ARGV.shift
 index_p = ARGV.shift
 unless (! options.usage) \
-  && maf_p && index_p && (! options.species.empty?) \
-  && (options.output_base ? options.bed : options.interval)
+  && maf_p && (! options.species.empty?) \
+  && (options.output_base \
+      ? options.bed \
+      : options.interval || options.genomic_interval)
   $stderr.puts o_parser
   exit 2
 end
-tiler = Bio::MAF::Tiler.new
-tiler.index = Bio::MAF::KyotoIndex.open(index_p)
-tiler.parser = Bio::MAF::Parser.new(maf_p, options.p)
-tiler.reference = Bio::MAF::FASTARangeReader.new(options.ref) if options.ref
-tiler.species = options.species
-tiler.species_map = options.species_map
-def parse_interval(line)
-  src, r_start_s, r_end_s, _ = line.split(nil, 4)
-  r_start = r_start_s.to_i
-  r_end = r_end_s.to_i
-  return Bio::GenomicInterval.zero_based(src, r_start, r_end)
-end
-def target_for(base, interval)
-  path = "#{base}_#{interval.zero_start}-#{interval.zero_end}.fa"
-  File.open(path, 'w')
-end
+access = if File.directory? maf_p
+           Bio::MAF::Access.maf_dir(maf_p, options.p)
+         else
+           Bio::MAF::Access.file(maf_p, index_p, options.p)
+         end
 if options.bed
   intervals = []
@@ -83,21 +94,34 @@ if options.bed
   end
   intervals.sort_by! { |int| int.zero_start }
   intervals.each do |int|
-    tiler.interval = int
-    target = target_for(options.output_base, int)
-    tiler.write_fasta(target)
-    target.close
+    access.tile(int) do |tiler|
+      apply_options(options, tiler)
+      target_for(options.output_base, int) do |target|
+        tiler.write_fasta(target)
+      end
+    end
   end
 else
   # single interval
-  tiler.interval = Bio::GenomicInterval.zero_based(tiler.index.ref_seq,
-                                                   options.interval.begin,
-                                                   options.interval.end)
-  if options.output_base
-    target = target_for(options.output_base, tiler.interval)
+  if options.genomic_interval
+    interval = options.genomic_interval
   else
-    target = $stdout
+    if access.indices.size != 1
+      raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
+    end
+    ref_seq = access.indices.keys.first
+    interval = Bio::GenomicInterval.zero_based(ref_seq,
+                                               options.interval.begin,
+                                               options.interval.end)
+  end
+  access.tile(interval) do |tiler|
+    apply_options(options, tiler)
+    if options.output_base
+      target = target_for(options.output_base, tiler.interval)
+    else
+      target = $stdout
+    end
+    tiler.write_fasta(target)
+    target.close
   end
-  tiler.write_fasta(target)
-  target.close
 end

data/bio-maf.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = "bio-maf"
-  s.version = "0.2.0"
+  s.version = "0.3.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Clayton Wheeler"]
-  s.date = "2012-06-29"
+  s.date = "2012-07-18"
   s.description = "Multiple Alignment Format parser for BioRuby."
   s.email = "cswh@umich.edu"
   s.executables = ["maf_count", "maf_dump_blocks", "maf_extract_ranges_count", "maf_index", "maf_parse_bench", "maf_to_fasta", "maf_write", "random_ranges"]
@@ -32,10 +32,11 @@ Gem::Specification.new do |s|
     s.platform = 'java'
   end
+  s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
   s.add_runtime_dependency('bio-bigbio', [">= 0"])
   s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
   if RUBY_PLATFORM == 'java'
-    s.add_runtime_dependency('kyotocabinet-java', ["~> 0.2.0"])
+    s.add_runtime_dependency('kyotocabinet-java', ["~> 0.3.0"])
   else
     s.add_runtime_dependency('kyotocabinet-ruby', ["~> 1.27.1"])
   end

data/features/block-joining.feature ADDED Viewed

@@ -0,0 +1,32 @@
+Feature: Join adjacent alignment blocks
+  After filtering out sequences
+  The sequence that caused two blocks to be separate may be removed
+  So it can be desirable to join such blocks together
+  Scenario: Two blocks natively in indexed access
+    Given indexed MAF files in "test/data"
+    When I query for the genomic intervals
+    | chrom    | start    | end      |
+    | mm8.chr7 | 80082334 | 80082471 |
+    Then 2 blocks are obtained
+    And the text size of block 0 is 54
+    And the text size of block 1 is 156
+  Scenario: Two blocks joined in indexed access
+    Given indexed MAF files in "test/data"
+    When I enable the :join_blocks parser option
+    And I filter for only the species
+    | mm8     |
+    | rn4     |
+    | oryCun1 |
+    | hg18    |
+    | panTro2 |
+    | rheMac2 |
+    | canFam2 |
+    | loxAfr1 |
+    | echTel1 |
+    And I query for the genomic intervals
+    | chrom    | start    | end      |
+    | mm8.chr7 | 80082334 | 80082471 |
+    Then 1 block is obtained
+    And the text size of block 0 is 210

data/features/dir-access.feature ADDED Viewed

@@ -0,0 +1,46 @@
+Feature: Provide access to multiple MAF files in a directory
+  In order to efficiently work with many MAF files
+  We need to provide a convenient interface to them
+  Scenario: Query for several chromosomes at once
+    Given indexed MAF files in "test/data"
+    When I query for the genomic intervals
+    | chrom    | start    | end      |
+    | mm8.chr7 | 80082580 | 80082612 |
+    | mm8.chrM | 1400     | 1590     |
+    Then 5 blocks are obtained
+  Scenario: Apply block filters
+    Given indexed MAF files in "test/data"
+    When I filter for blocks with text size at most 200
+    And I query for the genomic intervals
+    | chrom    | start    | end      |
+    | mm8.chr7 | 80082580 | 80082612 |
+    | mm8.chrM | 1400     | 1590     |
+    Then 3 blocks are obtained
+  Scenario: Apply sequence filters
+    Given indexed MAF files in "test/data"
+    When I filter for only the species
+    | mm8  |
+    | rn4  |
+    | hg18 |
+    And I query for the genomic intervals
+    | chrom    | start    | end      |
+    | mm8.chr7 | 80082580 | 80082612 |
+    | mm8.chrM | 1400     | 1590     |
+    Then 5 blocks are obtained
+    And block 0 has 3 sequences
+  Scenario: Set parse options
+    Given indexed MAF files in "test/data"
+    When I enable the :remove_gaps parser option
+    And I filter for only the species
+    | mm8  |
+    | rn4  |
+    | hg18 |
+    And I query for the genomic intervals
+    | chrom    | start    | end      |
+    | mm8.chr7 | 80082580 | 80082612 |
+    Then 2 blocks are obtained
+    And the text size of block 1 is 121