RubyGems - bio-maf - Versions diffs - 0.1.0-java - Mend

bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

data/.document +5 -0
data/.simplecov +1 -0
data/.travis.yml +16 -0
data/.yardopts +3 -0
data/DEVELOPMENT.md +40 -0
data/Gemfile +23 -0
data/LICENSE.txt +20 -0
data/README.md +209 -0
data/Rakefile +76 -0
data/VERSION +1 -0
data/benchmarks/dispatch_bench +53 -0
data/benchmarks/iter_bench +44 -0
data/benchmarks/read_bench +40 -0
data/benchmarks/sort_bench +33 -0
data/benchmarks/split_bench +33 -0
data/bin/maf_count +82 -0
data/bin/maf_dump_blocks +27 -0
data/bin/maf_extract_ranges_count +44 -0
data/bin/maf_index +88 -0
data/bin/maf_parse_bench +94 -0
data/bin/maf_to_fasta +68 -0
data/bin/maf_write +84 -0
data/bin/random_ranges +35 -0
data/features/maf-indexing.feature +31 -0
data/features/maf-output.feature +29 -0
data/features/maf-parsing.feature +44 -0
data/features/maf-querying.feature +75 -0
data/features/maf-to-fasta.feature +50 -0
data/features/step_definitions/convert_steps.rb +45 -0
data/features/step_definitions/index_steps.rb +20 -0
data/features/step_definitions/output_steps.rb +27 -0
data/features/step_definitions/parse_steps.rb +63 -0
data/features/step_definitions/query_steps.rb +31 -0
data/features/step_definitions/ucsc_bin_steps.rb +14 -0
data/features/support/env.rb +16 -0
data/features/ucsc-bins.feature +24 -0
data/lib/bio-maf.rb +12 -0
data/lib/bio-maf/maf.rb +3 -0
data/lib/bio/maf.rb +4 -0
data/lib/bio/maf/index.rb +620 -0
data/lib/bio/maf/parser.rb +888 -0
data/lib/bio/maf/struct.rb +63 -0
data/lib/bio/maf/writer.rb +63 -0
data/lib/bio/ucsc.rb +2 -0
data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
data/lib/bio/ucsc/ucsc_bin.rb +117 -0
data/man/.gitignore +1 -0
data/man/maf_index.1 +105 -0
data/man/maf_index.1.markdown +97 -0
data/man/maf_index.1.ronn +83 -0
data/man/maf_to_fasta.1 +53 -0
data/man/maf_to_fasta.1.ronn +51 -0
data/spec/bio/maf/index_spec.rb +363 -0
data/spec/bio/maf/parser_spec.rb +354 -0
data/spec/bio/maf/struct_spec.rb +75 -0
data/spec/spec_helper.rb +14 -0
data/test/data/big-block.maf +15999 -0
data/test/data/chr22_ieq.maf +11 -0
data/test/data/chrY-1block.maf +6 -0
data/test/data/empty +0 -0
data/test/data/empty.db +0 -0
data/test/data/mm8_chr7_tiny.kct +0 -0
data/test/data/mm8_chr7_tiny.maf +76 -0
data/test/data/mm8_mod_a.maf +7 -0
data/test/data/mm8_single.maf +13 -0
data/test/data/mm8_subset_a.maf +23 -0
data/test/data/t1-bad1.maf +15 -0
data/test/data/t1.fasta +12 -0
data/test/data/t1.maf +15 -0
data/test/data/t1a.maf +17 -0
data/test/helper.rb +18 -0
data/test/test_bio-maf.rb +7 -0
data/travis-ci/install_kc +13 -0
data/travis-ci/install_kc_java +13 -0
data/travis-ci/report_errors +4 -0
metadata +182 -0

data/bin/maf_to_fasta ADDED

@@ -0,0 +1,68 @@
+#!/usr/bin/env ruby
+require 'bio-maf'
+require 'bigbio'
+require 'optparse'
+require 'ostruct'
+options = OpenStruct.new
+options.parser = Bio::MAF::Parser
+OptionParser.new do |opts|
+  opts.banner = "Usage: maf_to_fasta [options] <maf> <fasta>"
+  opts.separator ""
+  opts.separator "Options:"
+  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
+    options.prof = prof
+  end
+  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |path|
+    options.ruby_prof = path
+  end
+  opts.on("--profile-gc", "Profile GC") do |prof|
+    options.profile_gc = true
+  end
+  opts.on("--parser PARSER", "parser") do |name|
+    options.parser = Bio::MAF.const_get(name)
+  end
+end.parse!(ARGV)
+src_path = ARGV.shift
+dst_path = ARGV.shift
+if options.prof
+  require 'perftools'
+  PerfTools::CpuProfiler.start(options.prof)
+elsif options.ruby_prof
+  require 'ruby-prof'
+  RubyProf.start
+end
+if options.profile_gc
+  GC::Profiler.enable
+end
+parser = options.parser.new(src_path)
+writer = FastaWriter.new(dst_path)
+parser.each_block do |block|
+  block.each_raw_seq do |seq|
+    seq.write_fasta(writer)
+  end
+end
+writer.close
+if options.profile_gc
+  $stderr.puts GC::Profiler.result
+  GC::Profiler.disable
+end
+if options.prof
+  PerfTools::CpuProfiler.stop
+elsif options.ruby_prof
+  res = RubyProf.stop
+  printer = RubyProf::FlatPrinter.new(res)
+  File.open(options.ruby_prof, 'w') do |f|
+    printer.print(f)
+  end
+end

data/bin/maf_write ADDED

@@ -0,0 +1,84 @@
+#!/usr/bin/env ruby
+require 'bio-maf'
+require 'optparse'
+require 'ostruct'
+options = OpenStruct.new
+options.parser = Bio::MAF::Parser
+options.opts = {
+  :chunk_reader => Bio::MAF::ChunkReader,
+  :parse_extended => false
+}
+PRINTERS = {
+  'flat' => :FlatPrinter,
+  'stack' => :CallStackPrinter
+}
+OptionParser.new do |opts|
+  opts.banner = "Usage: maf_write [options] <maf>"
+  opts.separator ""
+  opts.separator "Options:"
+  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
+    options.prof = prof
+  end
+  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
+    if pspec =~ /(\w+):(.+)/
+      require 'ruby-prof'
+      options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
+      options.ruby_prof_path = $2
+    else
+      options.ruby_prof_printer = RubyProf::FlatPrinter
+      options.ruby_prof_path = pspec
+    end
+  end
+  opts.on("--profile-gc", "Profile GC") do |prof|
+    options.profile_gc = true
+  end
+  opts.on("--parser PARSER", "parser") do |name|
+    options.parser = Bio::MAF.const_get(name)
+  end
+  opts.on("-t", "--threaded") do
+    options.opts[:chunk_reader] = Bio::MAF::ThreadedChunkReader
+    options.opts[:threads] = 1
+  end
+  opts.on("-e", "--extended") do
+    options.opts[:parse_extended] = true
+    options.opts[:parse_empty] = true
+  end
+end.parse!(ARGV)
+src_path = ARGV.shift
+if options.prof
+  require 'perftools'
+  PerfTools::CpuProfiler.start(options.prof)
+elsif options.ruby_prof_path
+  require 'ruby-prof'
+  RubyProf.start
+end
+if options.profile_gc
+  GC::Profiler.enable
+end
+parser = options.parser.new(src_path, options.opts)
+writer = Bio::MAF::Writer.new($stdout)
+writer.write_header(parser.header)
+writer.write_blocks(parser.parse_blocks)
+if options.profile_gc
+  $stderr.puts GC::Profiler.result
+  GC::Profiler.disable
+end
+if options.prof
+  PerfTools::CpuProfiler.stop
+elsif options.ruby_prof_path
+  res = RubyProf.stop
+  printer = options.ruby_prof_printer.new(res)
+  File.open(options.ruby_prof_path, 'w') do |f|
+    printer.print(f)
+  end
+end

data/bin/random_ranges ADDED

@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require 'ostruct'
+options = OpenStruct.new
+op = OptionParser.new do |opts|
+  opts.banner = "Usage: random_ranges [options]"
+  opts.on("-r", "--range START:END", "range") do |range|
+    s, e = range.split(':')
+    options.start = s.to_i
+    options.end = e.to_i
+  end
+  opts.on("-l", "--length LEN", "block length") do |len|
+    options.length = len.to_i
+  end
+  opts.on("-n", "--number NUM", "number of blocks") do |num|
+    options.num = num.to_i
+  end
+  opts.on("-s", "--sequence SEQ", "sequence") do |seq|
+    options.seq = seq
+  end
+end.parse!(ARGV)
+rand = Random.new
+range = options.end - options.start
+block_range = range / options.num
+block_start_range = block_range - options.length
+(0...options.num).each do |n|
+  block_offset = rand.rand(block_start_range)
+  b_start = options.start + (block_range * n) + block_offset
+  b_end = b_start + options.length
+  puts "#{options.seq}\t#{b_start}\t#{b_end}\tx"
+end

data/features/maf-indexing.feature ADDED

@@ -0,0 +1,31 @@
+@milestone_2
+Feature: Indexed access to MAF files
+  In order to extract alignment blocks from MAF files
+  By chromosomal ranges matching a source sequence
+  I want to have a way to build indexes on MAF files
+  And use indexes to efficiently find alignment blocks
+  Because linear searches of a 200 GB file are impractical
+  Scenario: Index a MAF file
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    Then the index has at least 8 entries
+  Scenario: Extract alignment blocks by chromosomal range
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    And search for blocks between positions 80082592 and 80082766 of mm8.chr7
+    Then 2 blocks are obtained
+    And sequence mm8.chr7 of block 0 has start 80082592
+    And sequence mm8.chr7 of block 1 has start 80082713
+  Scenario: Extract alignment blocks by chromosomal range from index file
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
+    When I open it with a MAF reader
+    And search for blocks between positions 80082592 and 80082766 of mm8.chr7
+    Then 2 blocks are obtained
+    And sequence mm8.chr7 of block 0 has start 80082592
+    And sequence mm8.chr7 of block 1 has start 80082713

data/features/maf-output.feature ADDED

@@ -0,0 +1,29 @@
+Feature: MAF output
+  In order to output modified MAF files or subsets of them
+  I want to be able to write out parsed MAF data
+  Scenario: Reproduce simple test data
+    Given a MAF source file "mm8_single.maf"
+    When I open it with a MAF reader
+    And open a new MAF writer
+    And write the header from the original MAF file
+    And write all the parsed blocks
+    Then the output should match, except whitespace, "mm8_single.maf"
+  Scenario: Reproduce longer test data
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And open a new MAF writer
+    And write the header from the original MAF file
+    And write all the parsed blocks
+    Then the output should match, except whitespace, "mm8_chr7_tiny.maf"
+  Scenario: Reproduce test data with i, e, q lines
+    Given a MAF source file "chr22_ieq.maf"
+    When I enable the :parse_extended parser option
+    And I enable the :parse_empty parser option
+    And I open it with a MAF reader
+    And open a new MAF writer
+    And write the header from the original MAF file
+    And write all the parsed blocks
+    Then the output should match, except whitespace, "chr22_ieq.maf"

data/features/maf-parsing.feature ADDED

@@ -0,0 +1,44 @@
+Feature: Parse MAF files
+  In order to extract information from a MAF file
+  I want to read it and pull out information
+  Scenario: Read MAF header
+    Given MAF data:
+    """
+    ##maf version=1 scoring=humor.v4
+    # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet25/chr22.maf /cluster/data/hg15/bed/blastz.rn3/axtNet25/chr22.maf
+    a score=0.128
+    s human_hoxa 100  8 + 100257 ACA-TTACT
+    s horse_hoxa 120  9 -  98892 ACAATTGCT
+    s fugu_hoxa   88  7  + 90788 ACA--TGCT
+    """
+    When I open it with a MAF reader
+    Then the MAF version should be "1"
+    And the scoring scheme should be "humor.v4"
+    # third line a continuation
+    And the alignment parameters should be "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet25/chr22.maf /cluster/data/hg15/bed/blastz.rn3/axtNet25/chr22.maf"
+  Scenario: Read alignment block
+    Given MAF data:
+    """
+    ##maf version=1 scoring=humor.v4
+    # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
+    # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
+    a score=0.128
+    s human_hoxa 100  8 + 100257 ACA-TTACT
+    s horse_hoxa 120  9 -  98892 ACAATTGCT
+    s fugu_hoxa   88  7  + 90788 ACA--TGCT
+    """
+    When I open it with a MAF reader
+    Then an alignment block can be obtained
+    And the alignment block has 3 sequences
+    And sequence 0 has source "human_hoxa"
+    And sequence 0 has start 100
+    And sequence 0 has size 8
+    And sequence 0 has strand :+
+    And sequence 0 has source size 100257
+    And sequence 0 has text "ACA-TTACT"
+    And sequence 1 has strand :-

data/features/maf-querying.feature ADDED

@@ -0,0 +1,75 @@
+@milestone_3
+Feature: Filter results from MAF files
+  In order to work with only relevant data from a MAF file
+  Such as only species recognized by PhyloCSF
+  I want to filter the results of MAF queries
+  Scenario: Return only specified species
+    Given MAF data:
+    """
+    ##maf version=1
+    a score=10542.0
+    s mm8.chr7                 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
+    s rn4.chr1                136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
+    s oryCun1.scaffold_199771     14021 43 -     75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
+    s hg18.chr15               88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
+    s panTro2.chr15            87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
+    s rheMac2.chr7             69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
+    s canFam2.chr3             56030570 39 +  94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
+    s dasNov1.scaffold_106893      7435 34 +      9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
+    s loxAfr1.scaffold_8298       30264 38 +     78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
+    s echTel1.scaffold_304651       594 37 -     10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
+    """
+    When I open it with a MAF reader
+    And filter for only the species
+    | hg18    |
+    | mm8     |
+    | rheMac2 |
+    Then an alignment block can be obtained
+    And the alignment block has 3 sequences
+  Scenario: Return only blocks having all specified species
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    And filter for blocks with the species
+    | panTro2 |
+    | loxAfr1 |
+    And search for blocks between positions 80082471 and 80082730 of mm8.chr7
+    Then 1 block is obtained
+  Scenario: Return only blocks having a certain number of sequences
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    And filter for blocks with at least 6 sequences
+    And search for blocks between positions 80082767 and 80083008 of mm8.chr7
+    Then 1 block is obtained
+  # sizes present:
+  # 55 64 128 148 157 163 165 192
+  Scenario: Return blocks with at least a given text size
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    And filter for blocks with text size at least 150
+    And search for blocks between positions 0 and 80100000 of mm8.chr7
+    Then 4 blocks are obtained
+  Scenario: Return blocks with at most a given text size
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    And filter for blocks with text size at most 72
+    And search for blocks between positions 0 and 80100000 of mm8.chr7
+    Then 2 blocks are obtained
+  Scenario: Return blocks within a text size range
+    Given a MAF source file "mm8_chr7_tiny.maf"
+    When I open it with a MAF reader
+    And build an index on the reference sequence
+    And filter for blocks with text size between 72 and 160
+    And search for blocks between positions 0 and 80100000 of mm8.chr7
+    Then 3 blocks are obtained

data/features/maf-to-fasta.feature ADDED

@@ -0,0 +1,50 @@
+Feature: Convert MAF file to FASTA
+  In order to use multiple alignment data with other tools
+  I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
+  Scenario: Convert simple MAF file
+    Given a MAF source file "t1.maf"
+    When I select FASTA output
+    And I open it with a MAF reader
+    And process the file
+    Then the output should match "t1.fasta"
+  Scenario: Convert simple MAF data
+    Given MAF data:
+    """
+    ##maf version=1 scoring=humor.v4
+    # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
+    # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
+    a score=0.128
+    s human_hoxa 100  8 + 100257 ACA-TTACT
+    s horse_hoxa 120  9 -  98892 ACAATTGCT
+    s fugu_hoxa   88  7  + 90788 ACA--TGCT
+    a score=0.071
+    s human_unc 9077 8 + 10998 ACAGTATT
+    # Comment
+    s horse_unc 4555 6 -  5099 ACA--ATT
+    s fugu_unc  4000 4 +  4038 AC----TT
+    """
+    When I select FASTA output
+    And I open it with a MAF reader
+    And process the file
+    Then the output should be:
+    """
+    >human_hoxa:100-108
+    ACA-TTACT
+    >horse_hoxa:120-129
+    ACAATTGCT
+    >fugu_hoxa:88-95
+    ACA--TGCT
+    >human_unc:9077-9085
+    ACAGTATT
+    >horse_unc:4555-4561
+    ACA--ATT
+    >fugu_unc:4000-4004
+    AC----TT
+    """

data/features/step_definitions/convert_steps.rb ADDED

@@ -0,0 +1,45 @@
+require 'bigbio'                # FASTA support
+Given /^a MAF source file "(.*?)"$/ do |src|
+  @src_f = $test_data + src
+  @src_f.exist?.should be_true
+end
+Given /^MAF data:$/ do |string|
+  @src_f = Tempfile.new(['rspec', '.maf'])
+  @src_f.write(string)
+  @src_f.close
+end
+When /^I select FASTA output$/ do
+  @dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
+  @dst.close
+  @writer = FastaWriter.new(@dst.path)
+end
+When /^process the file$/ do
+  @parser.each_block do |block|
+    block.each_raw_seq do |seq|
+      seq.write_fasta(@writer)
+    end
+  end
+  @writer.close
+end
+Then /^the output should match "(.*?)"$/ do |ref|
+  ref_p = $test_data + ref
+  ref_p.exist?.should be_true
+  #system("diff #{ref} #{@dst.path} >/dev/null 2>&1").should be_true
+  File.read(@dst.path).should == File.read(ref_p)
+end
+Then /^the output should be:$/ do |string|
+  File.read(@dst.path).should == string
+end
+After do
+  if @dst
+    @dst.close
+    @dst.unlink
+  end
+end