RubyGems - bio-maf - Versions diffs - 0.1.0-java - Mend

bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

data/.document +5 -0
data/.simplecov +1 -0
data/.travis.yml +16 -0
data/.yardopts +3 -0
data/DEVELOPMENT.md +40 -0
data/Gemfile +23 -0
data/LICENSE.txt +20 -0
data/README.md +209 -0
data/Rakefile +76 -0
data/VERSION +1 -0
data/benchmarks/dispatch_bench +53 -0
data/benchmarks/iter_bench +44 -0
data/benchmarks/read_bench +40 -0
data/benchmarks/sort_bench +33 -0
data/benchmarks/split_bench +33 -0
data/bin/maf_count +82 -0
data/bin/maf_dump_blocks +27 -0
data/bin/maf_extract_ranges_count +44 -0
data/bin/maf_index +88 -0
data/bin/maf_parse_bench +94 -0
data/bin/maf_to_fasta +68 -0
data/bin/maf_write +84 -0
data/bin/random_ranges +35 -0
data/features/maf-indexing.feature +31 -0
data/features/maf-output.feature +29 -0
data/features/maf-parsing.feature +44 -0
data/features/maf-querying.feature +75 -0
data/features/maf-to-fasta.feature +50 -0
data/features/step_definitions/convert_steps.rb +45 -0
data/features/step_definitions/index_steps.rb +20 -0
data/features/step_definitions/output_steps.rb +27 -0
data/features/step_definitions/parse_steps.rb +63 -0
data/features/step_definitions/query_steps.rb +31 -0
data/features/step_definitions/ucsc_bin_steps.rb +14 -0
data/features/support/env.rb +16 -0
data/features/ucsc-bins.feature +24 -0
data/lib/bio-maf.rb +12 -0
data/lib/bio-maf/maf.rb +3 -0
data/lib/bio/maf.rb +4 -0
data/lib/bio/maf/index.rb +620 -0
data/lib/bio/maf/parser.rb +888 -0
data/lib/bio/maf/struct.rb +63 -0
data/lib/bio/maf/writer.rb +63 -0
data/lib/bio/ucsc.rb +2 -0
data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
data/lib/bio/ucsc/ucsc_bin.rb +117 -0
data/man/.gitignore +1 -0
data/man/maf_index.1 +105 -0
data/man/maf_index.1.markdown +97 -0
data/man/maf_index.1.ronn +83 -0
data/man/maf_to_fasta.1 +53 -0
data/man/maf_to_fasta.1.ronn +51 -0
data/spec/bio/maf/index_spec.rb +363 -0
data/spec/bio/maf/parser_spec.rb +354 -0
data/spec/bio/maf/struct_spec.rb +75 -0
data/spec/spec_helper.rb +14 -0
data/test/data/big-block.maf +15999 -0
data/test/data/chr22_ieq.maf +11 -0
data/test/data/chrY-1block.maf +6 -0
data/test/data/empty +0 -0
data/test/data/empty.db +0 -0
data/test/data/mm8_chr7_tiny.kct +0 -0
data/test/data/mm8_chr7_tiny.maf +76 -0
data/test/data/mm8_mod_a.maf +7 -0
data/test/data/mm8_single.maf +13 -0
data/test/data/mm8_subset_a.maf +23 -0
data/test/data/t1-bad1.maf +15 -0
data/test/data/t1.fasta +12 -0
data/test/data/t1.maf +15 -0
data/test/data/t1a.maf +17 -0
data/test/helper.rb +18 -0
data/test/test_bio-maf.rb +7 -0
data/travis-ci/install_kc +13 -0
data/travis-ci/install_kc_java +13 -0
data/travis-ci/report_errors +4 -0
metadata +182 -0

data/man/maf_index.1.ronn ADDED

@@ -0,0 +1,83 @@
+maf_index(1) -- build and examine MAF indexes
+=============================================
+## SYNOPSIS
+`maf_index` [-t] <maf> <index><br>
+`maf_index` `-d`|`--dump` <index>
+## DESCRIPTION
+**maf_index** is part of the bioruby-maf library and creates
+Kyoto Cabinet indexes for Multiple Alignment Format (MAF)
+files. These indexes enable other MAF tools to selectively extract
+alignment blocks of interest.
+In its default mode, `maf_index` parses the <maf> file given as an
+argument and creates an index in <index>.
+The index data is stored in binary form, so with the `--dump`
+argument, `maf_index` can dump out the index data in human-readable
+form for debugging.
+## FILES
+The <maf> input file must be a valid MAF file of any length.
+The index created is a Kyoto Cabinet TreeDB (B+ tree) database;
+<index> must have a `.kct` extension.
+## OPTIONS
+TODO
+ * `-d`, `--dump`:
+   Instead of creating an index, dump out the given <index> in
+   human-readable form. Index records will appear like:
+       0 [bin 1195] 80082334:80082368
+         offset 16, length 1087
+         text size: 54
+         sequences in block: 10
+         species vector: 00000000000003ff
+ * `-t`, `--threaded`:
+   Use a separate reader thread to do I/O in parallel with
+   parsing. Only useful on JRuby.
+ * `--time`:
+   Print elapsed time for index creation. Mainly useful for measuring
+   performance with different Ruby implementations, I/O subsystems,
+   etc.
+## EXAMPLES
+Build an index on a MAF file:
+    $ maf_index chr22.maf chr22.kct
+Dump out an index:
+    $ maf_index -d chr22.kct > /tmp/chr22.dump
+## ENVIRONMENT
+`maf_index` is a Ruby program and relies on ordinary Ruby environment
+variables.
+## BUGS
+`maf_index` does not currently allow Kyoto Cabinet database parameters
+to be set.
+## COPYRIGHT
+`maf_index` is copyright (C) 2012 Clayton Wheeler.
+## SEE ALSO
+ruby(1), kctreemgr(1)
+ * <https://github.com/csw/bioruby-maf/>
+ * <http://fallabs.com/kyotocabinet/>

data/man/maf_to_fasta.1 ADDED

@@ -0,0 +1,53 @@
+.\" generated with Ronn/v0.7.3
+.\" http://github.com/rtomayko/ronn/tree/0.7.3
+.
+.TH "MAF_TO_FASTA" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
+.
+.SH "NAME"
+\fBmaf_to_fasta\fR \- convert MAF file to FASTA
+.
+.SH "SYNOPSIS"
+\fBmaf_to_fasta\fR [\fIoptions\fR\.\.\.] \fImaf\fR \fIfasta\fR
+.
+.SH "DESCRIPTION"
+\fBmaf_to_fasta\fR, part of the bioruby\-maf library, converts Multiple Alignment Format (MAF) files to FASTA format\. It does not attempt to combine alignment blocks, but simply writes out each sequence in order\.
+.
+.SH "FILES"
+The \fImaf\fR input file must be a valid MAF file of any length\.
+.
+.P
+The \fIfasta\fR output file will be written in FASTA format\.
+.
+.SH "OPTIONS"
+The options are only useful for performance measurement\.
+.
+.SH "EXAMPLES"
+Convert a MAF file to FASTA:
+.
+.IP "" 4
+.
+.nf
+$ maf_to_fasta chrY\.maf chrY\.fa
+.
+.fi
+.
+.IP "" 0
+.
+.SH "ENVIRONMENT"
+\fBmaf_to_fasta\fR is a Ruby program and relies on ordinary Ruby environment variables\.
+.
+.SH "BUGS"
+\fBmaf_to_fasta\fR should provide flexibility in selecting the alignment blocks and sequences to convert, and perhaps allow alignment blocks to be spliced together\.
+.
+.SH "COPYRIGHT"
+\fBmaf_to_fasta\fR is copyright (C) 2012 Clayton Wheeler\.
+.
+.SH "SEE ALSO"
+ruby(1)
+.
+.IP "\(bu" 4
+\fIhttps://github\.com/csw/bioruby\-maf/\fR
+.
+.IP "" 0

data/man/maf_to_fasta.1.ronn ADDED

@@ -0,0 +1,51 @@
+maf_to_fasta(1) -- convert MAF file to FASTA
+============================================
+## SYNOPSIS
+`maf_to_fasta` [<options>...] <maf> <fasta>
+## DESCRIPTION
+**maf_to_fasta**, part of the bioruby-maf library, converts Multiple
+Alignment Format (MAF) files to FASTA format. It does not attempt to
+combine alignment blocks, but simply writes out each sequence in
+order.
+## FILES
+The <maf> input file must be a valid MAF file of any length.
+The <fasta> output file will be written in FASTA format.
+## OPTIONS
+The options are only useful for performance measurement.
+## EXAMPLES
+Convert a MAF file to FASTA:
+    $ maf_to_fasta chrY.maf chrY.fa
+## ENVIRONMENT
+`maf_to_fasta` is a Ruby program and relies on ordinary Ruby
+environment variables.
+## BUGS
+`maf_to_fasta` should provide flexibility in selecting the alignment
+blocks and sequences to convert, and perhaps allow alignment blocks to
+be spliced together.
+## COPYRIGHT
+`maf_to_fasta` is copyright (C) 2012 Clayton Wheeler.
+## SEE ALSO
+ruby(1)
+ * <https://github.com/csw/bioruby-maf/>

data/spec/bio/maf/index_spec.rb ADDED

@@ -0,0 +1,363 @@
+require 'spec_helper'
+module Bio
+  module MAF
+    describe KyotoIndex do
+      def has_at_least_n_with_prefix(n, start)
+        @idx.db.cursor_process do |cur|
+          i = 0
+          cur.jump(start)
+          k = cur.get_key(true)
+          while k && k.start_with?(start) && i < n
+            i += 1
+          end
+          return i == n
+        end
+      end
+      describe ".build" do
+        it "accepts '%' as a path for an in-memory DB" do
+          expect {
+            @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+            @idx = KyotoIndex.build(@p, '%')
+            @p.f.close
+            @idx.close
+          }.not_to raise_error
+        end
+        it "accepts .kct paths"
+        it "rejects other paths"
+        context "mm8_chr7" do
+          before(:each) do
+            @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+            @idx = KyotoIndex.build(@p, '%')
+          end
+          it "uses the first sequence appearing as the reference sequence" do
+            @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
+          end
+          it "creates 8 index entries" do
+            has_at_least_n_with_prefix(8, "\xFF\x00").should be_true
+          end
+          it "stores the sequence IDs" do
+            @idx.db.match_prefix("sequence:").size.should == 1
+          end
+          it "stores the sequence IDs" do
+            @idx.db.get("sequence:mm8.chr7").should == "0"
+          end
+          describe "loads sequence data correctly" do
+            before(:each) { @idx = @idx.reopen }
+            it "uses the first sequence appearing as the reference sequence" do
+              @idx.index_sequences.to_a.should == [["mm8.chr7", 0]]
+            end
+          end
+          after(:each) do
+            @idx.db.close
+          end
+        end
+      end
+      describe ".open" do
+        it "opens an existing index successfully" do
+          @idx = KyotoIndex.open(TestData + 'mm8_chr7_tiny.kct')
+          @idx.db.count.should be > 8
+        end
+        it "populates #index_sequences" do
+          @idx = KyotoIndex.open(TestData + 'mm8_chr7_tiny.kct')
+          @idx.index_sequences.size.should be > 0
+          @idx.index_sequences['mm8.chr7'].should == 0
+        end
+        after(:each) do
+          @idx.db.close if @idx
+        end
+      end
+      describe "#find" do
+        context "mm8_chr7" do
+          before(:each) do
+            @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+            @idx = KyotoIndex.build(@p, '%')
+          end
+          it "returns a block given a range contained in the block" do
+            l = @idx.find([GenomicInterval.zero_based('mm8.chr7',
+                                                      80082334,
+                                                      80082338)],
+                                @p).to_a
+            l.size.should == 1
+            l[0].offset.should == 16
+          end
+          after(:each) do
+            @idx.db.close
+            @p.f.close
+          end
+        end
+      end
+      describe "#fetch_list" do
+        context "mm8_chr7" do
+          before(:each) do
+            @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+            @idx = KyotoIndex.build(@p, '%')
+          end
+          it "returns a block spec given a range contained in the block" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80082334,
+                                                            80082338)])
+            l.size.should == 1
+            l[0][0].should == 16 # block offset
+          end
+          it "returns a block spec with correct size" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80082334,
+                                                            80082338)])
+            l.size.should == 1
+            l[0][0].should == 16 # block offset
+            l[0][1].should == 1087 # block size
+          end
+          it "returns a block spec given its range exactly" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80082334,
+                                                            80082368)])
+            l.size.should == 1
+            l[0][0].should == 16 # block offset
+          end
+          it "returns specs for adjoining blocks given a range partially in each" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80082360,
+                                                            80082370)])
+            l.size.should == 2
+            l.collect { |e| e[0] }.should == [16, 1103]
+          end
+          it "returns a block spec given a range ending in it" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80082330,
+                                                            80082339)])
+            l.size.should == 1
+            l[0][0].should == 16 # block offset
+          end
+          it "returns no block spec given a zero-based range ending at a block start" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80082330,
+                                                            80082334)])
+            l.size.should == 0
+          end
+          it "returns a block spec given a range beginning in it" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80083009,
+                                                            80083220)])
+            l.size.should == 1
+            l[0][0].should == 10113 # block offset
+          end
+          it "returns no block spec given a range beginning at its end" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80083156,
+                                                            80083200)])
+            l.size.should == 0
+          end
+          it "returns specs for all blocks given a range fitting a larger bin" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            0,
+                                                            80083200)])
+            l.size.should == 8
+          end
+          it "returns no blocks given a range outside" do
+            l = @idx.fetch_list([GenomicInterval.zero_based('mm8.chr7',
+                                                            80083200,
+                                                            80083300)])
+          end
+          after(:each) do
+            if @idx
+              @idx.db.close
+            end
+          end
+        end
+      end
+      describe "#overlaps?" do
+        before(:each) do
+          @idx = KyotoIndex.new('%')
+        end
+        def check_overlap(x, y)
+          i = x[0]...x[1]
+          @idx.overlaps?(i, y[0], y[1])
+        end
+        it "handles equal intervals" do
+          check_overlap([0, 10],
+                        [0, 10]).should be_true
+        end
+        it "handles X contains Y" do
+          check_overlap([0, 10],
+                        [0, 9]).should be_true
+          check_overlap([0, 10],
+                        [1, 9]).should be_true
+          check_overlap([0, 10],
+                        [1, 10]).should be_true
+        end
+        it "handles Y contains X" do
+          check_overlap([0, 9],
+                        [0, 10]).should be_true
+          check_overlap([1, 9],
+                        [0, 10]).should be_true
+          check_overlap([1, 10],
+                        [0, 10]).should be_true
+        end
+        it "handles partial overlap" do
+          check_overlap([0, 9],
+                        [1, 10]).should be_true
+          check_overlap([1, 10],
+                        [0, 9]).should be_true
+        end
+        it "handles end cases" do
+          check_overlap([0, 10],
+                        [10, 15]).should be_false
+          check_overlap([10, 15],
+                        [0, 10]).should be_false
+        end
+        it "handles separated intervals" do
+          check_overlap([0, 10], [15, 20]).should be_false
+          check_overlap([15, 20], [0, 10]).should be_false
+        end
+        after(:each) do
+          @idx.db.close
+        end
+      end
+      describe "#entries_for" do
+        before(:each) do
+          @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+          @block = @p.parse_block
+          @idx = KyotoIndex.new('%')
+        end
+        context "single ref seq" do
+          before(:each) do
+            @idx.index_sequences = { 'mm8.chr7' => 0 }
+            @e = @idx.entries_for(@block)
+          end
+          it "gives the correct key data" do
+            _, seq, bin, i_start, i_end = @e.keys.first.unpack("CCS>L>L>")
+            seq.should == 0
+            bin.should == 1195
+            i_start.should == 80082334
+            i_end.should == 80082368
+          end
+          it "gives the correct offset" do
+            b_offset, b_len = @e.values.first.unpack("Q>L>")
+            b_offset.should == 16
+          end
+          it "gives the correct length" do
+            b_offset, b_len = @e.values.first.unpack("Q>L>")
+            b_len.should == 1087
+          end
+        end
+        after(:each) do
+          @p.f.close
+          @idx.db.close
+        end
+      end
+    end
+    describe "#species" do
+      before(:each) do
+        @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+        @idx = KyotoIndex.build(@p, '%')
+      end
+      shared_examples "species" do
+        it "records the correct number of species" do
+          @idx.species.size.should == 11
+        end
+        it "sets species_max_id correctly" do
+          @idx.species_max_id.should == 10
+        end
+      end
+      describe "after building index" do
+        include_examples "species"
+        it "records species in order" do
+          @idx.db["species:mm8"].should == "0"
+        end
+      end
+      describe "after loading index" do
+        before(:each) { @idx = @idx.reopen }
+        include_examples "species"
+      end
+    end
+    describe "Filter classes" do
+      before(:each) do
+        @p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
+        @idx = KyotoIndex.build(@p, '%')
+      end
+      describe AllSpeciesFilter do
+        def fake_entry_with(species_l)
+          ids = species_l.collect {|s| @idx.species.fetch(s)}
+          vec = ids.collect { |id| 1 << id }.reduce(0, :|)
+          return ['', [0, 0, 0, 0, vec].pack(KyotoIndex::VAL_FMT)]
+        end
+       context "with an empty set" do
+          before(:each) do
+            @filter = AllSpeciesFilter.new([], @idx)
+          end
+          it "matches anything" do
+            e = fake_entry_with(%w(mm8 rn4 oryCun1))
+            @filter.match(e).should be_true
+          end
+        end
+        context "with [mm8 rn4]" do
+          before(:each) do
+            @filter = AllSpeciesFilter.new(%w(mm8 rn4), @idx)
+          end
+          it "does not match an empty entry" do
+            e = fake_entry_with(%w())
+            KVHelpers.extract_species_vec(e).should == 0
+            @filter.bs.should_not == 0
+            @filter.match(e).should be_false
+          end
+          it "does not match an entry with mm8" do
+            e = fake_entry_with(%w(mm8))
+            @filter.match(e).should be_false
+          end
+          it "does not match an entry with mm8 oryCun1" do
+            e = fake_entry_with(%w(mm8 oryCun1))
+            @filter.match(e).should be_false
+          end
+          it "matches an entry with mm8 rn4" do
+            e = fake_entry_with(%w(mm8 rn4))
+            @filter.match(e).should be_true
+          end
+          it "does not match an entry with mm8 rn4 oryCun1" do
+            e = fake_entry_with(%w(mm8 rn4 oryCun1))
+            @filter.match(e).should be_true
+          end
+        end
+      end # AllSpeciesFilter
+      describe AtLeastNSequencesFilter do
+        def fake_entry_with(n)
+          return ['', [0, 0, 0, n, 0].pack(KyotoIndex::VAL_FMT)]
+        end
+        context "n = 3" do
+          before(:each) do
+            @filter = AtLeastNSequencesFilter.new(3, @idx)
+          end
+          it "does not match 2 sequences" do
+            e = fake_entry_with(2)
+            @filter.match(e).should be_false
+          end
+          it "matches 3 sequences" do
+            e = fake_entry_with(3)
+            @filter.match(e).should be_true
+          end
+        end
+      end # AtLeastNSequencesFilter
+      after(:each) do
+        @idx.close
+      end
+    end # filter classes
+  end # module MAF
+end # module Bio