RubyGems - bio-maf - Versions diffs - 0.1.0-java - Mend

bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

data/.document +5 -0
data/.simplecov +1 -0
data/.travis.yml +16 -0
data/.yardopts +3 -0
data/DEVELOPMENT.md +40 -0
data/Gemfile +23 -0
data/LICENSE.txt +20 -0
data/README.md +209 -0
data/Rakefile +76 -0
data/VERSION +1 -0
data/benchmarks/dispatch_bench +53 -0
data/benchmarks/iter_bench +44 -0
data/benchmarks/read_bench +40 -0
data/benchmarks/sort_bench +33 -0
data/benchmarks/split_bench +33 -0
data/bin/maf_count +82 -0
data/bin/maf_dump_blocks +27 -0
data/bin/maf_extract_ranges_count +44 -0
data/bin/maf_index +88 -0
data/bin/maf_parse_bench +94 -0
data/bin/maf_to_fasta +68 -0
data/bin/maf_write +84 -0
data/bin/random_ranges +35 -0
data/features/maf-indexing.feature +31 -0
data/features/maf-output.feature +29 -0
data/features/maf-parsing.feature +44 -0
data/features/maf-querying.feature +75 -0
data/features/maf-to-fasta.feature +50 -0
data/features/step_definitions/convert_steps.rb +45 -0
data/features/step_definitions/index_steps.rb +20 -0
data/features/step_definitions/output_steps.rb +27 -0
data/features/step_definitions/parse_steps.rb +63 -0
data/features/step_definitions/query_steps.rb +31 -0
data/features/step_definitions/ucsc_bin_steps.rb +14 -0
data/features/support/env.rb +16 -0
data/features/ucsc-bins.feature +24 -0
data/lib/bio-maf.rb +12 -0
data/lib/bio-maf/maf.rb +3 -0
data/lib/bio/maf.rb +4 -0
data/lib/bio/maf/index.rb +620 -0
data/lib/bio/maf/parser.rb +888 -0
data/lib/bio/maf/struct.rb +63 -0
data/lib/bio/maf/writer.rb +63 -0
data/lib/bio/ucsc.rb +2 -0
data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
data/lib/bio/ucsc/ucsc_bin.rb +117 -0
data/man/.gitignore +1 -0
data/man/maf_index.1 +105 -0
data/man/maf_index.1.markdown +97 -0
data/man/maf_index.1.ronn +83 -0
data/man/maf_to_fasta.1 +53 -0
data/man/maf_to_fasta.1.ronn +51 -0
data/spec/bio/maf/index_spec.rb +363 -0
data/spec/bio/maf/parser_spec.rb +354 -0
data/spec/bio/maf/struct_spec.rb +75 -0
data/spec/spec_helper.rb +14 -0
data/test/data/big-block.maf +15999 -0
data/test/data/chr22_ieq.maf +11 -0
data/test/data/chrY-1block.maf +6 -0
data/test/data/empty +0 -0
data/test/data/empty.db +0 -0
data/test/data/mm8_chr7_tiny.kct +0 -0
data/test/data/mm8_chr7_tiny.maf +76 -0
data/test/data/mm8_mod_a.maf +7 -0
data/test/data/mm8_single.maf +13 -0
data/test/data/mm8_subset_a.maf +23 -0
data/test/data/t1-bad1.maf +15 -0
data/test/data/t1.fasta +12 -0
data/test/data/t1.maf +15 -0
data/test/data/t1a.maf +17 -0
data/test/helper.rb +18 -0
data/test/test_bio-maf.rb +7 -0
data/travis-ci/install_kc +13 -0
data/travis-ci/install_kc_java +13 -0
data/travis-ci/report_errors +4 -0
metadata +182 -0

data/lib/bio/maf/struct.rb ADDED

@@ -0,0 +1,63 @@
+## NOTE: this is probably not the best place for this, ultimately.
+## If it works, think about moving it.
+module Bio
+  module MAF
+    class Struct
+      def initialize(spec)
+        @members = []
+        @by_name = {}
+        offset = 0
+        spec.each do |m_spec|
+          m = Member.new(offset, *m_spec)
+          @members << m
+          @by_name[m.name] = m
+          offset += m.size
+        end
+      end
+      def fmt
+        @members.collect { |m| m.fmt }.join('')
+      end
+      def extractor_fmt(*names)
+        extract = names.collect { |name| @by_name.fetch(name) }
+        extract.sort_by! { |m| m.offset }
+        fmt = ''
+        pos = 0
+        extract.each do |member|
+          if member.offset != pos
+            fmt << "@#{member.offset}"
+            pos = member.offset
+          end
+          fmt << member.fmt
+          pos += member.size
+        end
+        return fmt
+      end
+    end
+    TYPE_PROPS = {
+      :uint8  => { :size => 1, :fmt => 'C'  },
+      :uint16 => { :size => 2, :fmt => 'S>' },
+      :uint32 => { :size => 4, :fmt => 'L>' },
+      :uint64 => { :size => 8, :fmt => 'Q>' }
+    }
+    class Member
+      attr_reader :offset, :name, :type, :size, :fmt
+      def initialize(offset, name, type)
+        @offset = offset
+        @name = name
+        @type = type
+        props = TYPE_PROPS.fetch(type)
+        @size = props.fetch(:size)
+        @fmt = props.fetch(:fmt)
+      end
+    end
+ end
+end

data/lib/bio/maf/writer.rb ADDED

@@ -0,0 +1,63 @@
+module Bio::MAF
+  class Writer
+    attr_reader :f, :path
+    def initialize(fspec)
+      if fspec.respond_to? :write
+        @f = fspec
+        if fspec.respond_to? :path
+          @path = fspec.path
+        end
+      else
+        @path = fspec
+        @f = File.open(fspec, 'w')
+      end
+    end
+    def flatten_vars(vars)
+      vars.to_a.collect {|k, v| "#{k}=#{v}"}.join(" ")
+    end
+    def write_header(header)
+      f.puts "##maf #{flatten_vars(header.vars)}"
+      f.puts "##{header.alignment_params}" if header.alignment_params
+    end
+    def write_blocks(blocks)
+      blocks.each do |block|
+        write_block(block)
+      end
+      f.flush
+    end
+    def write_block(block)
+      lines = ["a #{flatten_vars(block.vars)}"]
+      block.sequences.each do |seq|
+        write_seq(seq, lines)
+      end
+      lines << " "
+      f.puts lines.join("\n")
+    end
+    def write_seq(s, lines)
+      lines << sprintf("%s %-20s %12d %2d %s %9d %s",
+                       s.empty? ? "e" : "s",
+                       s.source,
+                       s.start,
+                       s.size,
+                       s.strand,
+                       s.src_size,
+                       s.empty? ? s.status : s.text)
+      if s.quality
+        lines << sprintf("q %-20s                           %s",
+                         s.source, s.quality)
+      end
+      if s.i_data
+        lines << sprintf("i %-20s %s %s %s %s",
+                         s.source, *s.i_data)
+      end
+    end
+  end
+end

data/lib/bio/ucsc.rb ADDED

	@@ -0,0 +1,2 @@
1	+ require 'bio/ucsc/ucsc_bin'
2	+ require 'bio/ucsc/genomic-interval-bin'

data/lib/bio/ucsc/genomic-interval-bin.rb ADDED

@@ -0,0 +1,13 @@
+require "bio-genomic-interval"
+module Bio
+  class GenomicInterval
+    def bin
+      Bio::Ucsc::UcscBin.bin(self.zero_start, self.zero_end)
+    end
+    def bin_all
+      Bio::Ucsc::UcscBin.bin_all(self.zero_start, self.zero_end)
+    end
+  end
+end

data/lib/bio/ucsc/ucsc_bin.rb ADDED

@@ -0,0 +1,117 @@
+# = UCSCBin
+# Author::    MISHIMA, Hiroyuki
+# Copyright:: MISHIMA, Hiroyuki, 2010-2011
+# License::   The Ruby licence (Ruby's / GPLv2 dual)
+#
+# Original program in C by Jim Kent, 2002
+# See also http://genomewiki.ucsc.edu/index.php/Bin_indexing_system;
+# the paper Kent et al., Genome Research 2002, 12:996-1006;
+# and src/lib/binRange.c in the kent source tree.
+#
+# Bio::Ucsc::UcscBin -
+# 1) convert between 0-based half-open intervals and
+#    1-based fully-closed intervals.
+# 2) Calculate Bin number from genomic physical position
+# according to UCSC's Bin Indexing System.
+#
+module Bio
+  module Ucsc
+    class UcscBin
+      # Version = "0.1.0" # 20100714
+      # Version = "0.2.0" # 20101028
+      # Version = "0.2.1" # 20110408
+      Version = "0.2.2" # 20110418 the licence is changed
+                        # embeded in BioRubyUcscApi
+                        # handle the case, start==end in [start, end)
+      BINRANGE_MAXEND_512M       = (512*1024*1024)
+      BIN_OFFSETS_EXTENDED       = [4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0]
+ #     BIN_OFFSETS_EXTENDED = [4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1] (to omit BIN=0)
+      BIN_OFFSETS                = [512+64+8+1, 64+8+1, 8+1, 1, 0]
+ #     BIN_OFFSETS = [512+64+8+1, 64+8+1, 8+1, 1] (to omit BIN=0)
+      BIN_OFFSET_OLD_TO_EXTENDED = 4681
+      # How much to shift to get to finest bin.
+      BIN_FIRST_SHIFT            = 17
+      # How much to shift to get to next larger bin.
+      BIN_NEXT_SHIFT             = 3
+      # Return a Integer of a BIN which is the smallest/finest bin
+      # containing whole the interval/range.
+      #
+      # Extended bin index for positions >= 512M is not supported yet
+      # Do you need it? Please email me.
+      def self.bin_from_range(bin_start, bin_end)
+        if bin_end <= BINRANGE_MAXEND_512M
+          bin_from_range_standard(bin_start, bin_end)
+        else
+          bin_from_range_extended(bin_start, bin_end)
+        end
+      end
+      class << self;  alias bin bin_from_range; end
+      # Return an Array of BINs which are all bins containing whole the
+      # interval/range. Thus, it always contains "0" indicating a bin
+      # containing whole of a chromosome.
+      #
+      # extended bin index for positions >= 512M is not supported yet
+      # Do you need it? Please email me.
+      #
+      def self.bin_all(p_start, p_end)
+        if p_end <= BINRANGE_MAXEND_512M
+          bin_all_standard(p_start, p_end)
+        else
+          bin_all_extended(p_start, p_end)
+        end
+      end
+      private
+      def self.bin_from_range_standard(bin_start, bin_end)
+        # Given start,end in chromosome coordinates assign it
+        # a bin.   There's a bin for each 128k segment, for each
+        # 1M segment, for each 8M segment, for each 64M segment,
+        # and for each chromosome (which is assumed to be less than
+        # 512M.)  A range goes into the smallest bin it will fit in.
+        bin_start >>= BIN_FIRST_SHIFT
+        bin_end -= 1
+        bin_end >>= BIN_FIRST_SHIFT
+        BIN_OFFSETS.each do |offset|
+          return offset + bin_start if bin_start == bin_end
+          bin_start >>= BIN_NEXT_SHIFT
+          bin_end   >>= BIN_NEXT_SHIFT
+        end
+        raise RangeError, \
+        "start #{bin_start}, end #{bin_end} out of range in findBin (max is 512M)"
+      end
+      def self.bin_from_range_extended(bin_start, bin_end)
+        raise NotImplementedError, "Extended bins are not supported yet"
+      end
+      def self.bin_all_standard(bin_start, bin_end)
+        bin_start_orig = bin_start
+        bin_end_orig   = bin_end
+        results = Array.new
+        bin_start >>= BIN_FIRST_SHIFT
+        bin_end -= 1
+        bin_end >>= BIN_FIRST_SHIFT
+        BIN_OFFSETS.each do |offset|
+          results.concat(((offset + bin_start)..(offset + bin_end)).to_a)
+          bin_start >>= BIN_NEXT_SHIFT
+          bin_end   >>= BIN_NEXT_SHIFT
+        end
+        return results
+      end
+      def self.bin_all_extended(bin_start, bin_end)
+        raise NotImplementedError, "Extended bins are not supported yet"
+      end
+    end # class UcscBin
+  end # module Ucsc
+end # module Bio

data/man/.gitignore ADDED

	@@ -0,0 +1 @@
1	+ *.html

data/man/maf_index.1 ADDED

@@ -0,0 +1,105 @@
+.\" generated with Ronn/v0.7.3
+.\" http://github.com/rtomayko/ronn/tree/0.7.3
+.
+.TH "MAF_INDEX" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
+.
+.SH "NAME"
+\fBmaf_index\fR \- build and examine MAF indexes
+.
+.SH "SYNOPSIS"
+\fBmaf_index\fR [\-t] \fImaf\fR \fIindex\fR
+.
+.br
+\fBmaf_index\fR \fB\-d\fR|\fB\-\-dump\fR \fIindex\fR
+.
+.SH "DESCRIPTION"
+\fBmaf_index\fR is part of the bioruby\-maf library and creates Kyoto Cabinet indexes for Multiple Alignment Format (MAF) files\. These indexes enable other MAF tools to selectively extract alignment blocks of interest\.
+.
+.P
+In its default mode, \fBmaf_index\fR parses the \fImaf\fR file given as an argument and creates an index in \fIindex\fR\.
+.
+.P
+The index data is stored in binary form, so with the \fB\-\-dump\fR argument, \fBmaf_index\fR can dump out the index data in human\-readable form for debugging\.
+.
+.SH "FILES"
+The \fImaf\fR input file must be a valid MAF file of any length\.
+.
+.P
+The index created is a Kyoto Cabinet TreeDB (B+ tree) database; \fIindex\fR must have a \fB\.kct\fR extension\.
+.
+.SH "OPTIONS"
+TODO
+.
+.TP
+\fB\-d\fR, \fB\-\-dump\fR
+Instead of creating an index, dump out the given \fIindex\fR in human\-readable form\. Index records will appear like:
+.
+.IP "" 4
+.
+.nf
+0 [bin 1195] 80082334:80082368
+  offset 16, length 1087
+  text size: 54
+  sequences in block: 10
+  species vector: 00000000000003ff
+.
+.fi
+.
+.IP "" 0
+.
+.TP
+\fB\-t\fR, \fB\-\-threaded\fR
+Use a separate reader thread to do I/O in parallel with parsing\. Only useful on JRuby\.
+.
+.TP
+\fB\-\-time\fR
+Print elapsed time for index creation\. Mainly useful for measuring performance with different Ruby implementations, I/O subsystems, etc\.
+.
+.SH "EXAMPLES"
+Build an index on a MAF file:
+.
+.IP "" 4
+.
+.nf
+$ maf_index chr22\.maf chr22\.kct
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Dump out an index:
+.
+.IP "" 4
+.
+.nf
+$ maf_index \-d chr22\.kct > /tmp/chr22\.dump
+.
+.fi
+.
+.IP "" 0
+.
+.SH "ENVIRONMENT"
+\fBmaf_index\fR is a Ruby program and relies on ordinary Ruby environment variables\.
+.
+.SH "BUGS"
+\fBmaf_index\fR does not currently allow Kyoto Cabinet database parameters to be set\.
+.
+.SH "COPYRIGHT"
+\fBmaf_index\fR is copyright (C) 2012 Clayton Wheeler\.
+.
+.SH "SEE ALSO"
+ruby(1), kctreemgr(1)
+.
+.IP "\(bu" 4
+\fIhttps://github\.com/csw/bioruby\-maf/\fR
+.
+.IP "\(bu" 4
+\fIhttp://fallabs\.com/kyotocabinet/\fR
+.
+.IP "" 0

data/man/maf_index.1.markdown ADDED

@@ -0,0 +1,97 @@
+maf_index(1) -- build and examine MAF indexes
+=============================================
+## SYNOPSIS
+`maf_index` [-t] <var>maf</var> <var>index</var><br>
+`maf_index` `-d`|`--dump` <var>index</var>
+## DESCRIPTION
+**maf_index** is part of the bioruby-maf library and creates
+Kyoto Cabinet indexes for Multiple Alignment Format (MAF)
+files. These indexes enable other MAF tools to selectively extract
+alignment blocks of interest.
+In its default mode, `maf_index` parses the <var>maf</var> file given as an
+argument and creates an index in <var>index</var>.
+The index data is stored in binary form, so with the `--dump`
+argument, `maf_index` can dump out the index data in human-readable
+form for debugging.
+## FILES
+The <var>maf</var> input file must be a valid MAF file of any length.
+The index created is a Kyoto Cabinet TreeDB (B+ tree) database;
+<var>index</var> must have a `.kct` extension.
+## OPTIONS
+TODO
+ * `-d`, `--dump`:
+   Instead of creating an index, dump out the given <var>index</var> in
+   human-readable form. Index records will appear like:
+       0 [bin 1195] 80082334:80082368
+         offset 16, length 1087
+         text size: 54
+         sequences in block: 10
+         species vector: 00000000000003ff
+ * `-t`, `--threaded`:
+   Use a separate reader thread to do I/O in parallel with
+   parsing. Only useful on JRuby.
+ * `--time`:
+   Print elapsed time for index creation. Mainly useful for measuring
+   performance with different Ruby implementations, I/O subsystems,
+   etc.
+## EXAMPLES
+Build an index on a MAF file:
+    $ maf_index chr22.maf chr22.kct
+Dump out an index:
+    $ maf_index -d chr22.kct > /tmp/chr22.dump
+## ENVIRONMENT
+`maf_index` is a Ruby program and relies on ordinary Ruby environment
+variables.
+## BUGS
+`maf_index` does not currently allow Kyoto Cabinet database parameters
+to be set.
+## COPYRIGHT
+`maf_index` is copyright (C) 2012 Clayton Wheeler.
+## SEE ALSO
+ruby(1), kctreemgr(1)
+ * <https://github.com/csw/bioruby-maf/>
+ * <http://fallabs.com/kyotocabinet/>
+[SYNOPSIS]: #SYNOPSIS "SYNOPSIS"
+[DESCRIPTION]: #DESCRIPTION "DESCRIPTION"
+[FILES]: #FILES "FILES"
+[OPTIONS]: #OPTIONS "OPTIONS"
+[EXAMPLES]: #EXAMPLES "EXAMPLES"
+[ENVIRONMENT]: #ENVIRONMENT "ENVIRONMENT"
+[BUGS]: #BUGS "BUGS"
+[COPYRIGHT]: #COPYRIGHT "COPYRIGHT"
+[SEE ALSO]: #SEE-ALSO "SEE ALSO"
+[maf_index(1)]: maf_index.1.html