RubyGems - bio-maf - Versions diffs - 0.2.0-java → 0.3.0-java - Mend

bio-maf 0.2.0-java → 0.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

data/.gitignore +1 -0
data/Gemfile +3 -1
data/README.md +98 -29
data/Rakefile +6 -2
data/bin/maf_tile +59 -35
data/bio-maf.gemspec +4 -3
data/features/block-joining.feature +32 -0
data/features/dir-access.feature +46 -0
data/features/maf-indexing.feature +23 -0
data/features/maf-to-fasta.feature +9 -0
data/features/slice.feature +54 -0
data/features/step_definitions/dir-access_steps.rb +15 -0
data/features/step_definitions/file_steps.rb +7 -0
data/features/step_definitions/gap_removal_steps.rb +4 -0
data/features/step_definitions/index_steps.rb +3 -3
data/features/step_definitions/output_steps.rb +9 -1
data/features/step_definitions/parse_steps.rb +13 -2
data/features/step_definitions/query_steps.rb +7 -6
data/features/step_definitions/slice_steps.rb +15 -0
data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
data/features/support/aruba.rb +1 -0
data/features/support/env.rb +3 -1
data/features/{gap-filling.feature → tiling.feature} +85 -0
data/lib/bio/maf/index.rb +223 -11
data/lib/bio/maf/maf.rb +209 -0
data/lib/bio/maf/parser.rb +190 -111
data/lib/bio/maf/tiler.rb +33 -6
data/man/maf_index.1 +1 -1
data/man/maf_tile.1 +7 -7
data/man/maf_tile.1.ronn +21 -13
data/man/maf_to_fasta.1 +1 -1
data/spec/bio/maf/index_spec.rb +99 -0
data/spec/bio/maf/maf_spec.rb +184 -0
data/spec/bio/maf/parser_spec.rb +75 -115
data/spec/bio/maf/tiler_spec.rb +44 -0
data/test/data/chr22_ieq2.maf +11 -0
data/test/data/gap-1.kct +0 -0
data/test/data/gap-1.maf +9 -0
data/test/data/gap-filled1.fa +6 -0
data/test/data/gap-sp1.fa.gz +0 -0
data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
data/test/data/mm8_chrM_tiny.kct +0 -0
data/test/data/mm8_chrM_tiny.maf +1000 -0
metadata +65 -16

data/lib/bio/maf/tiler.rb CHANGED

@@ -1,3 +1,4 @@
+require 'pathname'
 require 'zlib'
 module Bio::MAF
@@ -9,7 +10,7 @@ module Bio::MAF
     attr_accessor :index
     attr_accessor :parser
-    attr_accessor :reference
+    attr_reader :reference
     # GenomicInterval
     attr_accessor :interval
     attr_accessor :species
@@ -19,6 +20,25 @@ module Bio::MAF
       @species_map = {}
     end
+    # Set the reference sequence.
+    #
+    # @param source [FASTARangeReader, String, Pathname]
+    def reference=(source)
+      ref = case
+            when source.is_a?(FASTARangeReader)
+              source
+            when source.respond_to?(:seek)
+              # open file
+              FASTARangeReader.new(source)
+            when source.respond_to?(:start_with?) && source.start_with?('>')
+              # FASTA string
+              FASTARangeReader.new(StringIO.new(source))
+            else
+              FASTARangeReader.new(source.to_s)
+            end
+      @reference = ref
+    end
     def ref_data(range)
       if reference
         if reference.respond_to? :read_interval
@@ -33,8 +53,12 @@ module Bio::MAF
       end
     end
+    def species_to_use
+      species || species_map.keys
+    end
     def tile
-      parser.sequence_filter[:only_species] = @species
+      parser.sequence_filter[:only_species] = species_to_use
       # TODO: remove gaps
       blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
       mask = Array.new(interval.length, :ref)
@@ -51,7 +75,7 @@ module Bio::MAF
                   (slice_start - i_start)...(slice_end - i_start))
       end
       text = []
-      species.each { |s| text << '' }
+      species_to_use.each { |s| text << '' }
       nonref_text = text[1...text.size]
       runs(mask) do |range, block|
         g_range = (range.begin + i_start)...(range.end + i_start)
@@ -69,7 +93,7 @@ module Bio::MAF
         else
           # covered by an alignment block
           t_range = block.ref_seq.text_range(g_range)
-          species.each_with_index do |species, i|
+          species_to_use.each_with_index do |species, i|
             sp_text = text[i]
             seq = block.sequences.find { |s| s.source == species || s.species == species }
             if seq
@@ -86,7 +110,7 @@ module Bio::MAF
     end
     def write_fasta(f)
-      species.zip(tile()) do |species, text|
+      species_to_use.zip(tile()) do |species, text|
         sp_out = species_map[species] || species
         f.puts ">#{sp_out}"
         f.puts text
@@ -147,10 +171,13 @@ module Bio::MAF
         line = line_raw.strip
         end_pos = pos + line.size
         if (! in_region) && pos <= z_start && z_start < end_pos
-          data << line.slice((z_start - pos)...(line.size))
+          offset = z_start - pos
+          end_offset = [(offset + region_size), line.size].min
+          data << line.slice(offset...end_offset)
           in_region = true
         elsif in_region
           need = region_size - data.size
+          raise "should not happen: region #{region_size}, data #{data.size}, need #{need}" if need < 0
           if need > line.size
             data << line
           else

data/man/maf_index.1 CHANGED

@@ -1,7 +1,7 @@
 .\" generated with Ronn/v0.7.3
 .\" http://github.com/rtomayko/ronn/tree/0.7.3
 .
-.TH "MAF_INDEX" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
+.TH "MAF_INDEX" "1" "July 2012" "BioRuby" "BioRuby Manual"
 .
 .SH "NAME"
 \fBmaf_index\fR \- build and examine MAF indexes

data/man/maf_tile.1 CHANGED

@@ -1,22 +1,22 @@
 .\" generated with Ronn/v0.7.3
 .\" http://github.com/rtomayko/ronn/tree/0.7.3
 .
-.TH "MAF_TILE" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
+.TH "MAF_TILE" "1" "July 2012" "BioRuby" "BioRuby Manual"
 .
 .SH "NAME"
 \fBmaf_tile\fR \- synthesize an alignment for a given region
 .
 .SH "SYNOPSIS"
-\fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
+\fBmaf_tile\fR [\fIoptions\fR] \-i [SEQ:]BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
 .
 .P
-\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
+\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
 .
 .SH "DESCRIPTION"
-\fBmaf_tile\fR takes a MAF file with index (generated by maf_index(1)), extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
+\fBmaf_tile\fR takes a MAF file, with optional index, or directory of indexed MAF files, extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
 .
 .P
-If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<start>:<end>\.fa\fR will be appended to the given  parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
+If a single interval is specified, the output will be written to stdout in FASTA format\. If a directory of MAF files is supplied as the \fImaf\fR parameter, the interval must include the sequence identifier in the form \fBsequence:begin:end\fR\. If the \fB\-\-output\-base\fR option is specified, \fB_<begin>:<end>\.fa\fR will be appended to the given  parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
 .
 .P
 Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
@@ -84,10 +84,10 @@ $ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
 The output is generated in FASTA format, with one sequence per species\.
 .
 .P
-The input \fImaf\fR file must be a Multiple Alignment Format file\.
+The \fImaf\fR parameter must specify either a Multiple Alignment Format (MAF) file or a directory of such files, with indexes\.
 .
 .P
-The \fIindex\fR must be a MAF index built with maf_index(1)\.
+The \fIindex\fR must be a MAF index built with maf_index(1)\. This parameter is ignored if the \fImaf\fR parameter is a directory\. It can be omitted if a single MAF file is given, but in this case the entire file will be parsed to build a temporary index\. For large files which will be reused, this is not advisable\.
 .
 .P
 If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.

data/man/maf_tile.1.ronn CHANGED

@@ -3,23 +3,26 @@ maf_tile(1) -- synthesize an alignment for a given region
 ## SYNOPSIS
-`maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
+`maf_tile` [<options>] -i [SEQ:]BEGIN:END [-s SPECIES[:NAME] ...] <maf> [index]
-`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
+`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> [index]
 ## DESCRIPTION
-**maf_tile** takes a MAF file with index (generated by maf_index(1)),
-extracts alignment blocks overlapping the given genomic interval, and
-constructs a single alignment block covering the entire interval for
-the specified species. Optionally, any gaps in coverage of the MAF
-file's reference sequence can be filled in from a FASTA sequence file.
+**maf_tile** takes a MAF file, with optional index, or directory of
+indexed MAF files, extracts alignment blocks overlapping the given
+genomic interval, and constructs a single alignment block covering the
+entire interval for the specified species. Optionally, any gaps in
+coverage of the MAF file's reference sequence can be filled in from a
+FASTA sequence file.
 If a single interval is specified, the output will be written to
-stdout in FASTA format. If the `--output-base` option is specified,
-`_<start>:<end>.fa` will be appended to the given <base> parameter and
-used to construct the output path. If a BED file is specified with
-`--bed`, `--output-base` is also required.
+stdout in FASTA format. If a directory of MAF files is supplied as the
+<maf> parameter, the interval must include the sequence identifier in
+the form `sequence:begin:end`. If the `--output-base` option is
+specified, `_<begin>:<end>.fa` will be appended to the given <base>
+parameter and used to construct the output path. If a BED file is
+specified with `--bed`, `--output-base` is also required.
 Species can be renamed for output by specifying them as SPECIES:NAME;
 the first component will be used to select the species from the MAF
@@ -80,9 +83,14 @@ sequence:
 The output is generated in FASTA format, with one sequence per
 species.
-The input <maf> file must be a Multiple Alignment Format file.
+The <maf> parameter must specify either a Multiple Alignment Format
+(MAF) file or a directory of such files, with indexes.
-The <index> must be a MAF index built with maf_index(1).
+The <index> must be a MAF index built with maf_index(1). This
+parameter is ignored if the <maf> parameter is a directory. It can be
+omitted if a single MAF file is given, but in this case the entire
+file will be parsed to build a temporary index. For large files which
+will be reused, this is not advisable.
 If `--bed` <bed> is specified, its argument must be a BED file. Only
 the second and third columns will be used, to specify the zero-based

data/man/maf_to_fasta.1 CHANGED

@@ -1,7 +1,7 @@
 .\" generated with Ronn/v0.7.3
 .\" http://github.com/rtomayko/ronn/tree/0.7.3
 .
-.TH "MAF_TO_FASTA" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
+.TH "MAF_TO_FASTA" "1" "July 2012" "BioRuby" "BioRuby Manual"
 .
 .SH "NAME"
 \fBmaf_to_fasta\fR \- convert MAF file to FASTA

data/spec/bio/maf/index_spec.rb CHANGED

@@ -3,6 +3,73 @@ require 'spec_helper'
 module Bio
   module MAF
+    describe Access do
+      describe "#tile" do
+        it "gives correct output with a Pathname" do
+          access = Access.maf_dir(TestData)
+          interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
+          buf = StringIO.new
+          access.tile(interval) do |tiler|
+            tiler.reference = TestData + 'gap-sp1.fa'
+            tiler.species = %w(sp1 sp2 sp3)
+            tiler.write_fasta(buf)
+          end
+          buf.string.should == File.read(TestData + 'gap-filled1.fa')
+        end
+        it "gives correct output with only a species map" do
+          access = Access.maf_dir(TestData)
+          interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
+          buf = StringIO.new
+          access.tile(interval) do |tiler|
+            tiler.reference = TestData + 'gap-sp1.fa'
+            tiler.species_map = {
+              'sp1' => 'sp1',
+              'sp2' => 'sp2',
+              'sp3' => 'sp3'
+            }
+            tiler.write_fasta(buf)
+          end
+          buf.string.should == File.read(TestData + 'gap-filled1.fa')
+        end
+        it "gives correct output with no species specified" do
+          pending("issue 88") do
+            access = Access.maf_dir(TestData)
+            interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
+            buf = StringIO.new
+            access.tile(interval) do |tiler|
+              tiler.reference = TestData + 'gap-sp1.fa'
+              tiler.write_fasta(buf)
+            end
+            buf.string.should == File.read(TestData + 'gap-filled1.fa')
+          end
+        end
+      end
+      describe ".file" do
+        it "accepts a MAF file and index" do
+          access = Access.file(TestData + 'gap-1.maf',
+                               TestData + 'gap-1.kct')
+          blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
+                                                           10,
+                                                           23)]).to_a
+          blocks.size.should == 1
+        end
+        it "accepts a MAF file and finds the index" do
+          access = Access.file(TestData + 'gap-1.maf')
+          blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
+                                                           10,
+                                                           23)]).to_a
+          blocks.size.should == 1
+        end
+        it "accepts a MAF file and builds a temp index" do
+          access = Access.file(TestData + 'chrY-1block.maf')
+          blocks = access.find([GenomicInterval.zero_based('hg19.chrY',
+                                                           10501,
+                                                           10544)]).to_a
+          blocks.size.should == 1
+        end
+      end
+    end
     describe KyotoIndex do
       def has_at_least_n_with_prefix(n, start)
         @idx.db.cursor_process do |cur|
@@ -87,6 +154,38 @@ module Bio
             l[0].offset.should == 16
           end
+          it "takes a block arg" do
+            called = false
+            @idx.find([GenomicInterval.zero_based('mm8.chr7',
+                                                  80082334,
+                                                  80082338)],
+                      @p) do |block|
+              block.offset.should == 16
+              called = true
+            end
+            called.should be_true
+          end
+          it "with a block and no match, returns" do
+            called = false
+            @idx.find([GenomicInterval.zero_based('mm8.chr7',
+                                                  20082334,
+                                                  20082338)],
+                      @p) do |block|
+              called = true
+            end
+            called.should be_false
+          end
+          it "with no block and no match, returns an empty list" do
+            v = @idx.find([GenomicInterval.zero_based('mm8.chr7',
+                                                  20082334,
+                                                  20082338)],
+                          @p)
+            v.should_not be_nil
+            v.should respond_to(:count)
+          end
           after(:each) do
             @idx.db.close
             @p.f.close

data/spec/bio/maf/maf_spec.rb ADDED

@@ -0,0 +1,184 @@
+require 'spec_helper'
+module Bio
+  module MAF
+    describe Header do
+      before(:each) do
+        @p = Parser.new(TestData + 't1.maf')
+      end
+      it "provides version information" do
+        @p.header.version.should == '1'
+      end
+      it "provides the scoring scheme" do
+        @p.header.scoring.should == 'humor.v4'
+      end
+      it "provides alignment parameters" do
+        @p.header.alignment_params.should =~ /humor.v4 R=30/
+      end
+      it "presents multiline parameters correctly" do
+        @p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
+      end
+      it "provides arbitrary parameters"
+    end
+    describe Block do
+      describe "#find_gaps" do
+        it "finds a single 14-base gap" do
+          p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+          p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
+          block = p.parse_block
+          gaps = block.find_gaps
+          gaps.size.should == 1
+          gaps[0][0].should == 34
+          gaps[0][1].should == 14
+        end
+      end
+      describe "#remove_gaps!" do
+        it "removes a single 14-base gap" do
+          p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+          p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
+          block = p.parse_block
+          block.sequences.size.should == 5
+          block.text_size.should == 54
+          block.remove_gaps!
+          block.text_size.should == 40
+        end
+      end
+      describe "#joinable_with?" do
+        it "is false for blocks with different sequences" do
+          p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+          sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 dasNov1 loxAfr1 echTel1)
+          p.sequence_filter = { :only_species => sp }
+          b1 = p.parse_block
+          b2 = p.parse_block
+          b1.joinable_with?(b2).should be_false
+        end
+        it "is true for blocks with same sequences" do
+          p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+          sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 loxAfr1 echTel1)
+          p.sequence_filter = { :only_species => sp }
+          b1 = p.parse_block
+          b2 = p.parse_block
+          b1.joinable_with?(b2).should be_true
+        end
+      end
+      describe "#to_bio_alignment" do
+        it "returns a usable Bio::BioAlignment::Alignment" do
+          p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+          b = p.parse_block
+          ba = b.to_bio_alignment
+          ba.size.should == 10
+          ba.sequences[0].id.should == "mm8.chr7"
+          ba.sequences[0].seq.should =~ /^GGGCTGAGGGC--/
+        end
+      end
+    end
+    describe Sequence do
+      before(:each) do
+        @parser = DummyParser.new
+      end
+      describe "#gapped?" do
+        it "is false for sequences with no gaps" do
+          line = "s human_unc 9077 8 + 10998 ACAGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          s.gapped?.should be_false
+        end
+        it "is true for sequences with gaps" do
+          line = "s human_unc 9077 8 + 10998 AC-AGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          s.gapped?.should be_true
+        end
+      end
+      describe "#text_range" do
+        it "returns 0...text.size for a spanning interval" do
+          line = "s human_unc 9077 8 + 10998 ACAGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          range = s.text_range(9077...(9077 + 8))
+          range.should == (0...(s.text.size))
+        end
+        it "returns 0...text.size for a gapped spanning interval" do
+          line = "s human_unc 9077 8 + 10998 AC--AGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          range = s.text_range(9077...(9077 + 8))
+          range.should == (0...(s.text.size))
+        end
+        it "handles a leading subset" do
+          line = "s human_unc 9077 8 + 10998 ACAGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          range = s.text_range(9077...(9077 + 2))
+          range.should == (0...2)
+        end
+        it "handles a trailing subset" do
+          line = "s human_unc 9077 8 + 10998 ACAGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          range = s.text_range(9079...9085)
+          range.should == (2...8)
+        end
+        it "handles a gap in the middle" do
+          line = "s human_unc 9077 8 + 10998 AC--AGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          range = s.text_range(9078...(9077 + 8))
+          range.should == (1...(s.text.size))
+        end
+        it "errors on a range starting before" do
+          expect {
+            line = "s human_unc 9077 8 + 10998 ACAGTATT"
+            s = @parser.parse_seq_line(line, nil)
+            range = s.text_range(9076...(9077 + 8))
+          }.to raise_error
+        end
+        it "errors on a range ending after" do
+          expect {
+            line = "s human_unc 9077 8 + 10998 ACAGTATT"
+            s = @parser.parse_seq_line(line, nil)
+            range = s.text_range(9076...(9077 + 9))
+          }.to raise_error
+        end
+      end
+      describe "synteny data" do
+        it "extracts basic data from i lines" do
+          p = Parser.new(TestData + 'chr22_ieq2.maf',
+                         :parse_extended => true)
+          b = p.parse_block
+          b.sequences[0].left_status_char.should be_nil
+          b.sequences[0].left_status.should be_nil
+          b.sequences[0].left_count.should be_nil
+          b.sequences[0].right_status_char.should be_nil
+          b.sequences[0].right_status.should be_nil
+          b.sequences[0].right_count.should be_nil
+          # works but let's not over-specify internal state
+          #b.sequences[1].i_data.should == %w(N 0 C 0)
+          b.sequences[1].left_status_char.should == 'N'
+          b.sequences[1].left_status.should == :first
+          b.sequences[1].right_status_char.should == 'C'
+          b.sequences[1].right_status.should == :contiguous
+          b.sequences[2].left_status.should == :contiguous
+          b.sequences[2].right_status_char.should == 'I'
+          b.sequences[2].right_status.should == :intervening
+          b.sequences[2].right_count.should == 146
+        end
+      end
+      describe "#to_bioalignment" do
+        it "returns a usable Bio::BioAlignment::Sequence" do
+          @parser = DummyParser.new
+          line = "s human_unc 9077 8 + 10998 ACAGTATT"
+          s = @parser.parse_seq_line(line, nil)
+          as = s.to_bio_alignment
+          as.id.should == "human_unc"
+          as.seq.should == "ACAGTATT"
+        end
+      end
+    end
+  end
+end