RubyGems - bio-maf - Versions diffs - 0.3.0-java → 0.3.2-java - Mend

bio-maf 0.3.0-java → 0.3.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/DEVELOPMENT.md +4 -0
data/README.md +172 -114
data/bin/maf_count +0 -1
data/bin/maf_dump_blocks +0 -1
data/bin/maf_extract +180 -0
data/bin/maf_index +15 -8
data/bin/maf_tile +2 -0
data/bin/maf_to_fasta +4 -7
data/bio-maf.gemspec +3 -4
data/features/maf-indexing.feature +21 -1
data/features/step_definitions/convert_steps.rb +2 -7
data/features/step_definitions/index_steps.rb +4 -0
data/lib/bio-maf.rb +5 -0
data/lib/bio/maf/index.rb +33 -23
data/lib/bio/maf/maf.rb +10 -7
data/lib/bio/maf/parser.rb +37 -15
data/lib/bio/maf/tiler.rb +60 -8
data/lib/bio/maf/writer.rb +26 -0
data/man/maf_extract.1 +268 -0
data/man/maf_extract.1.ronn +213 -0
data/man/maf_index.1 +21 -10
data/man/maf_index.1.ronn +14 -7
data/man/maf_tile.1 +12 -0
data/man/maf_tile.1.ronn +9 -0
data/spec/bio/maf/index_spec.rb +23 -0
metadata +14 -10

data/lib/bio/maf/tiler.rb CHANGED

@@ -13,16 +13,44 @@ module Bio::MAF
     attr_reader :reference
     # GenomicInterval
     attr_accessor :interval
+    # The species of interest to extract from the MAF file. Will be
+    # set as a {Parser#sequence_filter} for parsing. Defaults to the
+    # keys of {#species_map}.
+    #
+    # @return [Array<String>]
     attr_accessor :species
+    # A hash mapping species to their desired output names.
+    #
+    # @return [Hash]
     attr_accessor :species_map
+    # The character used to fill regions where no sequence data is available for a particular species. Defaults to `*`.
+    # @return [String]
+    attr_reader   :fill_char
+    # Create a tiler with an empty species map and `*` as the fill
+    # character.
     def initialize
       @species_map = {}
+      # Assign through the setter so the default is validated like any
+      # other fill character.
+      self.fill_char = '*'
+    end
+    # Set the character to be used for filling regions with no
+    # sequence data from the MAF file or a reference sequence.
+    # @param c [String] a one-character String to fill with
+    # @raise [ArgumentError] if c is not a String of length 1
+    def fill_char=(c)
+      unless c.is_a?(String) && c.length == 1
+        raise ArgumentError, "not a single character: #{c.inspect}"
+      end
+      @fill_char = c
     end
-    # Set the reference sequence.
+    # Set the reference sequence. This can be a {Pathname} or a
+    # {String} giving the path to an optionally-gzipped FASTA file, an
+    # open {IO} stream to a FASTA file, a String containing FASTA
+    # data, or a {FASTARangeReader} instance.
     #
-    # @param source [FASTARangeReader, String, Pathname]
+    # @param source [FASTARangeReader, String, Pathname, #readline]
     def reference=(source)
       ref = case
             when source.is_a?(FASTARangeReader)
@@ -57,6 +85,13 @@ module Bio::MAF
       species || species_map.keys
     end
+    # The names to use for output, in the order given by
+    # {#species_to_use}: each species' entry in {#species_map}, or the
+    # species identifier itself when it has no mapping.
+    #
+    # @return [Array]
+    def species_for_output
+      species_to_use.collect { |s| species_map[s] || s }
+    end
+    # Return an array of tiled sequence data, in the order given by
+    # {#species_to_use}.
+    # @return [Array<String>]
     def tile
       parser.sequence_filter[:only_species] = species_to_use
       # TODO: remove gaps
@@ -88,8 +123,8 @@ module Bio::MAF
                      else
                        'N' * range_size
                      end
-          stars = '*' * range_size
-          nonref_text.each { |t| t << stars }
+          fill_text = fill_char * range_size
+          nonref_text.each { |t| t << fill_text }
         else
           # covered by an alignment block
           t_range = block.ref_seq.text_range(g_range)
@@ -100,8 +135,8 @@ module Bio::MAF
               # got alignment text
               sp_text << seq.text.slice(t_range)
             else
-              # no alignment for this one here, use '*'
-              sp_text << '*' * (t_range.end - t_range.begin)
+              # no alignment for this one here, use the fill char
+              sp_text << fill_char * (t_range.end - t_range.begin)
             end
           end
         end
@@ -109,9 +144,26 @@ module Bio::MAF
       text
     end
+    # Tile sequences to build a new {Bio::BioAlignment::Alignment
+    # Alignment} object. This will have one
+    # {Bio::BioAlignment::Sequence Sequence} per entry in {#species}
+    # or {#species_map}, in the same order. Each sequence will have an
+    # {Bio::BioAlignment::Sequence#id id} given by {#species_map} or,
+    # if none is present, the identifier from {#species}.
+    #
+    # @return [Bio::BioAlignment::Alignment]
+    # @api public
+    def build_bio_alignment
+      # Both arguments follow the order of species_to_use, so the tiled
+      # texts and the output names line up one-to-one.
+      Bio::BioAlignment::Alignment.new(tile(), species_for_output)
+    end
+    # Write a FASTA representation of the tiled sequences to the given
+    # output stream.
+    #
+    # @param [#puts] f the output stream to write the FASTA data to.
+    # @api public
     def write_fasta(f)
-      species_to_use.zip(tile()) do |species, text|
-        sp_out = species_map[species] || species
+      species_for_output.zip(tile()) do |sp_out, text|
         f.puts ">#{sp_out}"
         f.puts text
       end

data/lib/bio/maf/writer.rb CHANGED

@@ -59,5 +59,31 @@ module Bio::MAF
       end
     end
   end
+  # Maximum number of sequence characters written per FASTA output line.
+  FASTA_LINE_LEN = 72
+  # Writes the sequences of alignment blocks to an output stream in
+  # FASTA format.
+  class FASTAWriter
+    # @param outf [#puts, #close] the stream FASTA output is written to
+    def initialize(outf)
+      @f = outf
+    end
+    # Write each sequence of a block as its own FASTA record, skipping
+    # sequences that report themselves as empty.
+    #
+    # @param block [#sequences] the alignment block to write
+    def write_block(block)
+      block.sequences.each do |seq|
+        write_sequence(seq) unless seq.empty?
+      end
+    end
+    # Write one sequence as a FASTA record: a `>` description line
+    # followed by its text wrapped at FASTA_LINE_LEN characters.
+    #
+    # @param seq [#fasta_desc, #text] the sequence to write
+    def write_sequence(seq)
+      @f.puts(">#{seq.fasta_desc}")
+      text = seq.text
+      # Integer#step includes its limit, so stop at size - 1. Stepping
+      # up to size itself produced an empty final slice, and thus a
+      # spurious blank line, whenever the text length was an exact
+      # multiple of FASTA_LINE_LEN (or zero).
+      0.step(text.size - 1, FASTA_LINE_LEN) do |pos|
+        @f.puts(text.slice(pos, FASTA_LINE_LEN))
+      end
+    end
+    # Close the underlying output stream.
+    def close
+      @f.close
+    end
+  end
 end

data/man/maf_extract.1 ADDED

@@ -0,0 +1,268 @@
+.\" generated with Ronn/v0.7.3
+.\" http://github.com/rtomayko/ronn/tree/0.7.3
+.
+.TH "MAF_EXTRACT" "1" "July 2012" "BioRuby" "BioRuby Manual"
+.
+.SH "NAME"
+\fBmaf_extract\fR \- extract blocks from MAF files
+.
+.SH "SYNOPSIS"
+\fBmaf_extract\fR \-m MAF [\-i INDEX] \-\-interval SEQ:START\-END \fIOPTIONS\fR
+.
+.P
+\fBmaf_extract\fR \-m MAF [\-i INDEX] \-\-bed BED \fIOPTIONS\fR
+.
+.P
+\fBmaf_extract\fR \-d MAFDIR \-\-interval SEQ:START\-END \fIOPTIONS\fR
+.
+.P
+\fBmaf_extract\fR \-d MAFDIR \-\-bed BED \fIOPTIONS\fR
+.
+.SH "DESCRIPTION"
+\fBmaf_extract\fR extracts alignment blocks from one or more indexed MAF files, according to either a genomic interval specified with \fB\-\-interval\fR or multiple intervals given in a BED file specified with \fB\-\-bed\fR\.
+.
+.P
+It can either match blocks intersecting the specified intervals with \fB\-\-mode intersect\fR, the default, or extract slices of them which cover only the specified intervals, with \fB\-\-mode slice\fR\.
+.
+.P
+Blocks and the sequences they contain can be filtered with a variety of options including \fB\-\-only\-species\fR, \fB\-\-with\-all\-species\fR, \fB\-\-min\-sequences\fR, \fB\-\-min\-text\-size\fR, and \fB\-\-max\-text\-size\fR\.
+.
+.P
+With the \fB\-\-join\-blocks\fR option, adjacent parsed blocks can be joined if sequence filtering has removed a species causing them to be separated\. The \fB\-\-remove\-gaps\fR option will remove columns containing only gaps (\fB\-\fR)\.
+.
+.P
+Blocks can be output in MAF format, with \fB\-\-format maf\fR (the default), or FASTA format, with \fB\-\-format fasta\fR\. Output can be directed to a file with \fB\-\-output\fR\.
+.
+.P
+This tool exposes almost all the random\-access functionality of the Bio::MAF::Access class\. The exception is MAF tiling, which is provided by maf_tile(1)\.
+.
+.SH "FILES"
+A single MAF file can be processed by specifying it with \fB\-\-maf\fR\. Its accompanying index, created by maf_index(1), is specified with \fB\-\-index\fR\. If \fB\-\-maf\fR is given but no index is specified, the entire file will be parsed to build a temporary in\-memory index\. This facilitates processing small, transient MAF files\. However, on a large file this will incur a great deal of overhead; files expected to be used more than once should be indexed with maf_index(1)\.
+.
+.P
+Alternatively, a directory of indexed MAF files can be specified with \fB\-\-maf\-dir\fR; in this case, they will all be used to satisfy queries\.
+.
+.SH "OPTIONS"
+MAF source options:
+.
+.TP
+\fB\-m\fR, \fB\-\-maf MAF\fR
+A single MAF file to process\.
+.
+.TP
+\fB\-i\fR, \fB\-\-index INDEX\fR
+An index for the file specified with \fB\-\-maf\fR, as created by maf_index(1)\.
+.
+.TP
+\fB\-d\fR, \fB\-\-maf\-dir DIR\fR
+A directory of indexed MAF files\.
+.
+.P
+Extraction options:
+.
+.TP
+\fB\-\-mode (intersect | slice)\fR
+The extraction mode to use\. With \fB\-\-mode intersect\fR, any alignment block intersecting the genomic intervals specified will be matched in its entirety\. With \fB\-\-mode slice\fR, intersecting blocks will be matched in the same way, but columns extending outside the specified interval will be removed\.
+.
+.TP
+\fB\-\-bed BED\fR
+The specified file will be parsed as a BED file, and each interval it contains will be matched in turn\.
+.
+.TP
+\fB\-\-interval SEQ:START\-END\fR
+A single zero\-based half\-open genomic interval will be matched, with sequence identifier \fIseq\fR, (inclusive) start position \fIstart\fR, and (exclusive) end position \fIend\fR\.
+.
+.P
+Output options:
+.
+.TP
+\fB\-f\fR, \fB\-\-format (maf | fasta)\fR
+Output will be written in the specified format, either MAF or FASTA\.
+.
+.TP
+\fB\-o\fR, \fB\-\-output OUT\fR
+Output will be written to the file \fIout\fR\.
+.
+.P
+Filtering options:
+.
+.TP
+\fB\-\-only\-species (SP1,SP2,SP3 | @FILE)\fR
+Alignment blocks will be filtered to contain only the specified species\. These can be given as a comma\-separated list or as a file, prefixed with \fB@\fR, from which a list of species will be read\.
+.
+.TP
+\fB\-\-with\-all\-species (SP1,SP2,SP3 | @FILE)\fR
+Only alignment blocks containing all the specified species will be matched\. These can be given as a comma\-separated list or as a file, prefixed with \fB@\fR, from which a list of species will be read\.
+.
+.TP
+\fB\-\-min\-sequences N\fR
+Only alignment blocks containing at least \fIn\fR sequences will be matched\.
+.
+.TP
+\fB\-\-min\-text\-size N\fR
+Only alignment blocks with a text size (including gaps) of at least \fIn\fR will be matched\.
+.
+.TP
+\fB\-\-max\-text\-size N\fR
+Only alignment blocks with a text size (including gaps) of at most \fIn\fR will be matched\.
+.
+.P
+Block processing options:
+.
+.TP
+\fB\-\-join\-blocks\fR
+If sequence filtering with \fB\-\-only\-species\fR removes a species which caused two adjacent blocks to be separate, this option will join them together into a single alignment block\. The filtered blocks must contain the same sequences in contiguous positions and on the same strand\.
+.
+.TP
+\fB\-\-remove\-gaps\fR
+If sequence filtering with \fB\-\-only\-species\fR leaves a block containing columns consisting only of gap characters (\fB\-\fR), these will be removed\.
+.
+.TP
+\fB\-\-parse\-extended\fR
+Parse \fBi\fR lines, giving information on the context of sequence lines, and \fBq\fR lines, giving quality scores\.
+.
+.TP
+\fB\-\-parse\-empty\fR
+Parse \fBe\fR lines, indicating cases where a species does not align with the current block but does align with blocks before and after it\.
+.
+.P
+Logging options:
+.
+.TP
+\fB\-q\fR, \fB\-\-quiet\fR
+Run quietly, with warnings suppressed\.
+.
+.TP
+\fB\-v\fR, \fB\-\-verbose\fR
+Run verbosely, with additional informational messages\.
+.
+.TP
+\fB\-\-debug\fR
+Log debugging information\.
+.
+.SH "EXAMPLES"
+Extract MAF blocks intersecting with a given interval:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-interval mm8\.chr7:80082592\-80082766
+.
+.fi
+.
+.IP "" 0
+.
+.P
+As above, but operating on a single file:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-m test/data/mm8_chr7_tiny\.maf \e
+      \-i test/data/mm8_chr7_tiny\.kct \e
+      \-\-interval mm8\.chr7:80082592\-80082766
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Like the first case, but writing output to a file:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-interval mm8\.chr7:80082592\-80082766 \e
+      \-\-output out\.maf
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Extract a slice of MAF blocks over a given interval:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-mode slice \e
+      \-\-interval mm8\.chr7:80082592\-80082766
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Filter for sequences from only certain species:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-interval mm8\.chr7:80082592\-80082766 \e
+      \-\-only\-species hg18,mm8,rheMac2
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Extract only blocks with all specified species:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-interval mm8\.chr7:80082471\-80082730 \e
+      \-\-with\-all\-species panTro2,loxAfr1
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Extract blocks with at least a certain number of sequences:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-interval mm8\.chr7:80082767\-80083008 \e
+      \-\-min\-sequences 6
+.
+.fi
+.
+.IP "" 0
+.
+.P
+Extract blocks with text sizes in a certain range:
+.
+.IP "" 4
+.
+.nf
+$ maf_extract \-d test/data \-\-interval mm8\.chr7:0\-80100000 \e
+      \-\-min\-text\-size 72 \-\-max\-text\-size 160
+.
+.fi
+.
+.IP "" 0
+.
+.SH "ENVIRONMENT"
+\fBmaf_extract\fR is a Ruby program and relies on ordinary Ruby environment variables\.
+.
+.SH "BUGS"
+No provision exists for writing output to multiple files\.
+.
+.P
+FASTA description lines are always in the format \fB>source:start\-end\fR\.
+.
+.SH "COPYRIGHT"
+\fBmaf_extract\fR is copyright (C) 2012 Clayton Wheeler\.
+.
+.SH "SEE ALSO"
+ruby(1), maf_index(1), maf_tile(1)

data/man/maf_extract.1.ronn ADDED

@@ -0,0 +1,213 @@
+maf_extract(1) -- extract blocks from MAF files
+===============================================
+## SYNOPSIS
+`maf_extract` -m MAF [-i INDEX] --interval SEQ:START-END [OPTIONS]
+`maf_extract` -m MAF [-i INDEX] --bed BED [OPTIONS]
+`maf_extract` -d MAFDIR --interval SEQ:START-END [OPTIONS]
+`maf_extract` -d MAFDIR --bed BED [OPTIONS]
+## DESCRIPTION
+**maf_extract** extracts alignment blocks from one or more indexed MAF
+files, according to either a genomic interval specified with
+`--interval` or multiple intervals given in a BED file specified with
+`--bed`.
+It can either match blocks intersecting the specified intervals with
+`--mode intersect`, the default, or extract slices of them which cover
+only the specified intervals, with `--mode slice`.
+Blocks and the sequences they contain can be filtered with a variety
+of options including `--only-species`, `--with-all-species`,
+`--min-sequences`, `--min-text-size`, and `--max-text-size`.
+With the `--join-blocks` option, adjacent parsed blocks can be joined if
+sequence filtering has removed a species causing them to be
+separated. The `--remove-gaps` option will remove columns containing
+only gaps (`-`).
+Blocks can be output in MAF format, with `--format maf` (the default),
+or FASTA format, with `--format fasta`. Output can be directed to a
+file with `--output`.
+This tool exposes almost all the random-access functionality of the
+Bio::MAF::Access class. The exception is MAF tiling, which is provided
+by maf_tile(1).
+## FILES
+A single MAF file can be processed by specifying it with `--maf`. Its
+accompanying index, created by maf_index(1), is specified with
+`--index`. If `--maf` is given but no index is specified, the entire
+file will be parsed to build a temporary in-memory index. This
+facilitates processing small, transient MAF files. However, on a large
+file this will incur a great deal of overhead; files expected to be
+used more than once should be indexed with maf_index(1).
+Alternatively, a directory of indexed MAF files can be specified with
+`--maf-dir`; in this case, they will all be used to satisfy queries.
+## OPTIONS
+MAF source options:
+ * `-m`, `--maf MAF`:
+   A single MAF file to process.
+ * `-i`, `--index INDEX`:
+   An index for the file specified with `--maf`, as created by
+   maf_index(1).
+ * `-d`, `--maf-dir DIR`:
+   A directory of indexed MAF files.
+Extraction options:
+ * `--mode (intersect | slice)`:
+   The extraction mode to use. With `--mode intersect`, any alignment
+   block intersecting the genomic intervals specified will be matched
+   in its entirety. With `--mode slice`, intersecting blocks will be
+   matched in the same way, but columns extending outside the
+   specified interval will be removed.
+ * `--bed BED`:
+   The specified file will be parsed as a BED file, and each interval
+   it contains will be matched in turn.
+ * `--interval SEQ:START-END`:
+   A single zero-based half-open genomic interval will be matched,
+   with sequence identifier <seq>, (inclusive) start position <start>,
+   and (exclusive) end position <end>.
+Output options:
+ * `-f`, `--format (maf | fasta)`:
+   Output will be written in the specified format, either MAF or
+   FASTA.
+ * `-o`, `--output OUT`:
+   Output will be written to the file <out>.
+Filtering options:
+ * `--only-species (SP1,SP2,SP3 | @FILE)`:
+   Alignment blocks will be filtered to contain only the specified
+   species. These can be given as a comma-separated list or as a file,
+   prefixed with `@`, from which a list of species will be read.
+ * `--with-all-species (SP1,SP2,SP3 | @FILE)`:
+   Only alignment blocks containing all the specified species will be
+   matched. These can be given as a comma-separated list or as a file,
+   prefixed with `@`, from which a list of species will be read.
+ * `--min-sequences N`:
+   Only alignment blocks containing at least <n> sequences will be
+   matched.
+ * `--min-text-size N`:
+   Only alignment blocks with a text size (including gaps) of at least
+   <n> will be matched.
+ * `--max-text-size N`:
+   Only alignment blocks with a text size (including gaps) of at most
+   <n> will be matched.
+Block processing options:
+ * `--join-blocks`:
+   If sequence filtering with `--only-species` removes a species which
+   caused two adjacent blocks to be separate, this option will join
+   them together into a single alignment block. The filtered blocks
+   must contain the same sequences in contiguous positions and on the
+   same strand.
+ * `--remove-gaps`:
+   If sequence filtering with `--only-species` leaves a block
+   containing columns consisting only of gap characters (`-`), these
+   will be removed.
+ * `--parse-extended`:
+   Parse `i` lines, giving information on the context of sequence
+   lines, and `q` lines, giving quality scores.
+ * `--parse-empty`:
+   Parse `e` lines, indicating cases where a species does not align
+   with the current block but does align with blocks before and after
+   it.
+Logging options:
+ * `-q`, `--quiet`:
+   Run quietly, with warnings suppressed.
+ * `-v`, `--verbose`:
+   Run verbosely, with additional informational messages.
+ * `--debug`:
+   Log debugging information.
+## EXAMPLES
+Extract MAF blocks intersecting with a given interval:
+    $ maf_extract -d test/data --interval mm8.chr7:80082592-80082766
+As above, but operating on a single file:
+    $ maf_extract -m test/data/mm8_chr7_tiny.maf \
+          -i test/data/mm8_chr7_tiny.kct \
+          --interval mm8.chr7:80082592-80082766
+Like the first case, but writing output to a file:
+    $ maf_extract -d test/data --interval mm8.chr7:80082592-80082766 \
+          --output out.maf
+Extract a slice of MAF blocks over a given interval:
+    $ maf_extract -d test/data --mode slice \
+          --interval mm8.chr7:80082592-80082766
+Filter for sequences from only certain species:
+    $ maf_extract -d test/data --interval mm8.chr7:80082592-80082766 \
+          --only-species hg18,mm8,rheMac2
+Extract only blocks with all specified species:
+    $ maf_extract -d test/data --interval mm8.chr7:80082471-80082730 \
+          --with-all-species panTro2,loxAfr1
+Extract blocks with at least a certain number of sequences:
+    $ maf_extract -d test/data --interval mm8.chr7:80082767-80083008 \
+          --min-sequences 6
+Extract blocks with text sizes in a certain range:
+    $ maf_extract -d test/data --interval mm8.chr7:0-80100000 \
+          --min-text-size 72 --max-text-size 160
+## ENVIRONMENT
+`maf_extract` is a Ruby program and relies on ordinary Ruby environment
+variables.
+## BUGS
+No provision exists for writing output to multiple files.
+FASTA description lines are always in the format `>source:start-end`.
+## COPYRIGHT
+`maf_extract` is copyright (C) 2012 Clayton Wheeler.
+## SEE ALSO
+ruby(1), maf_index(1), maf_tile(1)