RubyGems - genomer-plugin-view - Versions diffs - 0.0.2 - Mend

genomer-plugin-view 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

data/.document +5 -0
data/.gitignore +52 -0
data/Gemfile +2 -0
data/LICENSE.txt +20 -0
data/README.rdoc +19 -0
data/Rakefile +9 -0
data/features/agp/generation.feature +285 -0
data/features/fasta/contigs.feature +173 -0
data/features/fasta/single_sequence.feature +144 -0
data/features/mappings/core.feature +181 -0
data/features/support/env.rb +13 -0
data/features/table/cds_entries.feature +304 -0
data/features/table/core.feature +302 -0
data/features/table/feature_type.feature +180 -0
data/genomer-plugin-view.gemspec +34 -0
data/lib/genomer-plugin-view/agp.rb +62 -0
data/lib/genomer-plugin-view/fasta.rb +36 -0
data/lib/genomer-plugin-view/gff_record_helper.rb +61 -0
data/lib/genomer-plugin-view/mapping.rb +14 -0
data/lib/genomer-plugin-view/table.rb +56 -0
data/lib/genomer-plugin-view/version.rb +3 -0
data/lib/genomer-plugin-view.rb +29 -0
data/man/genomer-view-agp.ronn +46 -0
data/man/genomer-view.ronn +153 -0
data/spec/genomer-view-plugin/agp_spec.rb +79 -0
data/spec/genomer-view-plugin/fasta_spec.rb +96 -0
data/spec/genomer-view-plugin/gff_record_helper_spec.rb +244 -0
data/spec/genomer-view-plugin/mapping_spec.rb +89 -0
data/spec/genomer-view-plugin/table_spec.rb +279 -0
data/spec/genomer-view-plugin_spec.rb +103 -0
data/spec/spec_helper.rb +32 -0
metadata +192 -0

data/genomer-plugin-view.gemspec ADDED Viewed

@@ -0,0 +1,34 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path("../lib/genomer-plugin-view/version", __FILE__)
+Gem::Specification.new do |s|
+  s.name        = "genomer-plugin-view"
+  s.version     = GenomerViewPlugin::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.homepage    = "http://github.com/michaelbarton/genomer-plugin-view"
+  s.license     = "MIT"
+  s.authors     = ["Michael Barton"]
+  s.email       = ["mail@michaelbarton.me.uk"]
+  s.summary     = %Q{Provide different views of scaffold.}
+  s.description = %Q{Convert genome scaffold into different sequence format views}
+  s.required_rubygems_version = "~> 1.8.0"
+  s.rubyforge_project         = "genomer-view-plugin"
+  s.add_dependency "genomer", ">= 0.0.5"
+  # Specs
+  s.add_development_dependency "rspec",                   "~> 2.9.0"
+  s.add_development_dependency "rr",                      "~> 1.0.4"
+  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
+  s.add_development_dependency "heredoc_unindent",        "~> 1.1.2"
+  # Features
+  s.add_development_dependency "cucumber", "~> 1.1.9"
+  s.add_development_dependency "aruba",    "~> 0.4.11"
+  s.files        = `git ls-files`.split("\n")
+  s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
+  s.require_path = 'lib'
+end

data/lib/genomer-plugin-view/agp.rb ADDED Viewed

@@ -0,0 +1,62 @@
+require 'genomer'
+# Produces an AGP (version 2.0) text view of the scaffold: one tab-delimited
+# line per contig or gap, preceded by the AGP version header.
+class GenomerPluginView::Agp < Genomer::Plugin
+  # Build the complete AGP document.
+  #
+  # Returns a String: the header line followed by every entry line, each
+  # terminated by a newline. Array#join flattens the nested arrays that
+  # #entries returns for sequence entries.
+  def run
+    header = "##agp-version\t2.0"
+    entries.unshift(header).join("\n") + "\n"
+  end
+  # Locate every match of +regex+ in the upper-cased +seq+.
+  #
+  # Returns an Array of Ranges holding the 1-based, inclusive start and end
+  # position of each match, in order of appearance.
+  def locations(seq,regex)
+    seq.upcase.enum_for(:scan, regex).map do
+      (Regexp.last_match.begin(0)+1)..(Regexp.last_match.end(0))
+    end
+  end
+  # Walk the scaffold and build the AGP line(s) for each scaffold entry.
+  #
+  # Returns an Array with one element per scaffold entry: a String for an
+  # :unresolved entry, an Array of Strings (contigs and internal gaps in
+  # positional order) for a :sequence entry, and nil for any other entry type.
+  def entries
+    cumulative_length = 1 # 1-based scaffold position of the next entry
+    count   = 0           # running AGP part number
+    contigs = 0           # running contig number, used in contig names
+    scaffold.map do |entry|
+      case entry.entry_type
+      when :unresolved then
+        length = entry.sequence.length
+        count += 1
+        start = cumulative_length
+        cumulative_length += length
+        gap(start, cumulative_length - 1, count, 'specified')
+      when :sequence   then
+        # #locations upper-cases the sequence itself, so no extra copy is
+        # made here.
+        seq = entry.sequence
+        seq_regions = locations(seq,/[^N]+/).map{|loc| [:contig,loc]}
+        gap_regions = locations(seq,/N+/).map{|loc| [:gap,loc]}
+        # The regions never overlap, so ordering by start position alone
+        # interleaves them correctly without expanding each range into an
+        # array of every coordinate.
+        regions = (seq_regions + gap_regions).sort_by{|_,loc| loc.begin}
+        regions.map do |(type,location)|
+          count += 1
+          # Span of the region: one less than the number of bases it covers.
+          length = (location.end - location.begin)
+          line = case type
+                 when :contig then
+                   contigs += 1
+                   contig(length, cumulative_length, count, contigs)
+                 when :gap    then
+                   gap(cumulative_length, cumulative_length + length, count, 'internal')
+                 end
+          cumulative_length += length + 1
+          line
+        end
+      end
+    end
+  end
+  # Format one contig ("W") line.
+  #
+  # length - region span (number of bases minus one)
+  # cum    - 1-based start position in the scaffold
+  # count  - AGP part number
+  # no     - contig number, rendered as contigNNNNN
+  #
+  # Returns the tab-delimited line as a String.
+  def contig(length, cum, count, no)
+    [
+      'scaffold', cum, cum + length, count, 'W',
+      sprintf("contig%05d",no), 1, length + 1, '+'
+    ].join("\t")
+  end
+  # Format one gap ("N") line.
+  #
+  # start, stop - 1-based inclusive scaffold coordinates of the gap
+  # count       - AGP part number
+  # type        - value written to the final (linkage evidence) column
+  #
+  # Returns the tab-delimited line as a String.
+  def gap(start, stop, count, type)
+    [
+      'scaffold', start, stop, count, 'N',
+      stop - start + 1, 'scaffold', 'yes', type
+    ].join("\t")
+  end
+end

data/lib/genomer-plugin-view/fasta.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require 'genomer'
+class GenomerPluginView::Fasta < Genomer::Plugin
+  def run
+    if flags[:contigs]
+      flags.delete(:contigs)
+      sequence.
+        split(/[Nn]+/).
+        map{|s| Bio::Sequence.new(s) }.
+        each_with_index.
+        map{|s,i| s.output(:fasta,:header => header(sprintf("contig%05d",i+1))) }.
+        join
+    else
+      Bio::Sequence.new(sequence).output(:fasta,:header => header(identifier))
+    end
+  end
+  def header(identifier)
+    (identifier + ' ' + header_flags).strip
+  end
+  def identifier
+    flags[:identifier] ? flags.delete(:identifier) : '.'
+  end
+  def header_flags
+    flags.map{|k,v| "[#{k}=#{v}]" }.join(' ')
+  end
+  def sequence
+    scaffold.map{|entry| entry.sequence}.join
+  end
+end

data/lib/genomer-plugin-view/gff_record_helper.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require 'bio'
+module GenomerPluginView::GffRecordHelper
+  DEFAULT_GFF_MAPPING = {'product' => 'product', 'Note' => 'note' }
+  GFF_TO_TABLE = {
+    'gene' => {
+      'ID'   => 'locus_tag',
+      'Name' => 'gene'
+    },
+    'CDS' => DEFAULT_GFF_MAPPING.merge({
+      'ID'        => 'protein_id',
+      'ec_number' => 'EC_number',
+      'function'  => 'function',
+    }),
+    'miscRNA' => DEFAULT_GFF_MAPPING,
+    'rRNA'    => DEFAULT_GFF_MAPPING,
+    'tmRNA'   => DEFAULT_GFF_MAPPING,
+    'tRNA'    => DEFAULT_GFF_MAPPING
+  }
+  def negative_strand?
+    self.strand == '-'
+  end
+  def coordinates
+    if negative_strand?
+      [self.end,self.start,self.feature]
+    else
+      [self.start,self.end,self.feature]
+    end
+  end
+  def to_genbank_table_entry
+    delimiter = "\t"
+    indent    = delimiter * 2
+    entries = table_attributes.inject([coordinates]) do |array,atr|
+      array << atr.unshift(indent)
+    end
+    return entries.map{|line| line * delimiter} * "\n" + "\n"
+  end
+  def valid?
+    GFF_TO_TABLE.include?(feature)
+  end
+  def table_attributes
+    raise Genomer::Error, "Unknown feature type '#{feature}'" unless valid?
+    attributes.map do |(k,v)|
+      k = GFF_TO_TABLE[feature][k]
+      k.nil? ? nil : [k,v]
+    end.compact
+  end
+end
+Bio::GFF::GFF3::Record.send(:include, GenomerPluginView::GffRecordHelper)

data/lib/genomer-plugin-view/mapping.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require 'genomer'
+class GenomerPluginView::Mapping < Genomer::Plugin
+  def run
+    original = annotations.map(&:id).map(&:clone)
+    updated  = annotations(GenomerPluginView.convert_command_line_flags(flags)).map(&:id)
+    original.zip(updated).
+      map{|i| i.join("\t") }.
+      join("\n")
+  end
+end

data/lib/genomer-plugin-view/table.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'genomer'
+require 'genomer-plugin-view/gff_record_helper'
+class GenomerPluginView::Table < Genomer::Plugin
+  def run
+    options = GenomerPluginView.convert_command_line_flags(flags)
+    header = ">Feature\t#{options[:identifier]}\tannotation_table\n"
+    attns = annotations(options)
+    attns = create_encoded_features(attns, options[:encoded]) if options[:encoded]
+    attns.inject(header) do |table,attn|
+      table << attn.to_genbank_table_entry
+    end
+  end
+  SUPPORTED_FEATURE_TYPES = ['CDS','rRNA','tRNA','miscRNA','tmRNA']
+  def create_encoded_features(genes,prefix)
+    features = genes.map do |gene|
+      feature = gene.clone
+      attrs   = Hash[feature.attributes]
+      if id = attrs['ID']
+        attrs['ID'] = (prefix.is_a?(String) ? prefix + id : id)
+      end
+      feature.feature = attrs['feature_type'] || 'CDS'
+      unless SUPPORTED_FEATURE_TYPES.include?(feature.feature)
+        raise Genomer::Error, "Unknown feature_type '#{feature.feature}'"
+      end
+      if feature.feature == "CDS"
+        name, prdt, ftn = attrs['Name'], attrs['product'], attrs['function']
+        if name
+          name = name.clone
+          name[0] = name[0].upcase
+          prdt, ftn = name,prdt
+        end
+        attrs.delete('Name')
+        attrs['product']  = prdt
+        attrs['function'] = ftn
+      end
+      feature.attributes = attrs.to_a.reject{|(_,value)| value.nil? }
+      feature
+    end
+    genes.zip(features).flatten
+  end
+end

data/lib/genomer-plugin-view/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Holder for the gem's version string, read by the gemspec as
+# GenomerViewPlugin::VERSION.
+# NOTE(review): this class is named GenomerViewPlugin, whereas the plugin
+# class itself is GenomerPluginView; the two are separate constants.
+class GenomerViewPlugin
+  VERSION = "0.0.2"
+end

data/lib/genomer-plugin-view.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require 'genomer'
+class GenomerPluginView < Genomer::Plugin
+  def run
+    self.class.fetch_view(arguments.shift).new(arguments,flags).run
+  end
+  def self.fetch_view(view)
+    require 'genomer-plugin-view/' + view
+    const_get(view.capitalize)
+  end
+  def self.convert_command_line_flags(flags)
+    flags.inject(Hash.new) do |hash,(k,v)|
+      k = case k
+      when :identifier                then k
+      when :prefix                    then k
+      when :generate_encoded_features then :encoded
+      when :reset_locus_numbering     then :reset
+      else nil
+      end
+      hash[k] = v if k
+      hash
+    end
+  end
+end

data/man/genomer-view-agp.ronn ADDED Viewed

@@ -0,0 +1,46 @@
+genomer-view-agp(1) -- Generate agp file views of scaffold
+==========================================================
+## SYNOPSIS
+`genomer view agp`
+## DESCRIPTION
+**Genomer-view-agp** produces an AGP view of a scaffold. This format shows the
+positions of gaps and contigs in the scaffold. More details on this format can
+be found on the [AGP specification page](http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/AGP_Specification.shtml "AGP Specification").
+This command converts the scaffold into AGP format as follows:
+  * contigs:
+    Contiguous sequences of non-N nucleotides generate corresponding sequence
+    entries in the AGP file. Note: two sequences positioned next to each other
+    in the scaffold file do not, however, produce a single contig entry.
+  * internal contig gaps:
+    Regions of N characters in scaffold sequences are converted to 'scaffold' gaps
+    in the AGP file. The 'Linkage Evidence' field is set to "internal" and this
+    should be set to the correct AGP field type after generation. See the AGP
+    Specification for allowed evidence types.
+  * unresolved regions:
+    'Unresolved' entries in the 'scaffold.yml' file result in a **scaffold**
+    gap entry in AGP file. The 'Linkage Evidence' field is set to "specified"
+    and this should be set to the correct AGP field type after generation. See
+    the AGP Specification for allowed evidence types.
+## EXAMPLES
+    $ genomer view agp
+## BUGS
+**Genomer-view** is written in Ruby and depends on the genomer gem. See the
+Gemfile in the genomer-plugin-view gem install directory for version details.
+## COPYRIGHT
+**Genomer** is Copyright (C) 2012 Michael Barton <http://michaelbarton.me.uk>

data/man/genomer-view.ronn ADDED Viewed

@@ -0,0 +1,153 @@
+genomer-view(1) -- Generate file format views of scaffold and annotations
+=========================================================================
+## SYNOPSIS
+`genomer view` <flatfile-type> [<options>...]
+## DESCRIPTION
+**Genomer-view** assembles the scaffold and associated annotations to produce
+common database file formats. The generated file format view is specified by
+the **flat-file** argument.
+## OPTIONS
+  * `--identifier`=[<identifier>]:
+    The sequence identifier to include in generated flatfile outputs.
+  * `--strain`=[<strain>]:
+    The strain of the source organism.
+  * `--organism`=[<organism>]:
+    The genus and species, enclosed in single quotes, of the source organism.
+  * `--prefix`=[<gene-prefix>]:
+    Prepend all ID attributes from the annotation file with <gene-prefix> in
+    the generated output.
+  * `--reset_locus_numbering`:
+    Reset gene ID to begin at 1 from the start of the sequence in the generated
+    output file.
+  * `--generate_encoded_features`=[<feature-prefix>]:
+    Generate corresponding 1:1 encoded feature entries from the genes entries
+    in the annotation file. These will commonly be CDS entries but RNA type
+    entries are also supported. The feature IDs are generated from the
+    corresponding gene ID prefixed with the <feature-prefix>.
+## GFF NINTH COLUMN ATTRIBUTES
+The annotation file should be in GFF3 format and contain the annotations for
+the scaffolded contigs. The default location for this file is
+**assembly/annotations.gff**. The following attributes in the GFF3 file are
+treated specially by genomer when generating flat file output.
+### GFF DEFINED ATTRIBUTES
+These attributes have a predefined meaning in the GFF specification. These all
+begin with an upper case letter.
+  * `ID`:
+    Used to specify the ID of annotations in the output. If the
+    `--generate_encoded_features` option is passed, the encoded features have
+    an ID generated from this field prefixed with the <feature-prefix>
+    argument. This field should be unique in the annotation file.
+  * `Name`:
+    Used to specify the four letter annotation name, e.g. pilO. The lower case
+    version is used for gene names. If the `--generate_encoded_features` option
+    is passed, additional encoded feature entries have the `product` field
+    generated from the capitalised version of this attribute. This need not be
+    unique in the file.
+  * `Note`:
+    Used to populate the **Note** field for entries when the
+    `--generate_encoded_features` option is passed.
+### GENOMER ATTRIBUTES
+These attributes are specific to genomer and should begin with a lower case
+letter. Many of these attributes have a corresponding relationship with fields
+in genbank table format, however a caveat to this is outlined in the next
+section.
+  * `product`:
+    Used to populate the **product** field for encoded features when the
+    `--generate_encoded_features` option is passed. If the **Name** attribute
+    is also present then the **function** field is instead populated with this
+    value.
+  * `feature_type`:
+    When the gene product is not a CDS this field can be used, when the
+    `--generate_encoded_features` option is passed, as the corresponding entry
+    type instead of `CDS`. The genbank specification lists examples for `rRNA`,
+    `tmRNA`, `tRNA`, and `miscRNA`. If you require other feature types to be
+    implemented, please contact me through the website below.
+  * `ec_number`:
+    Used to populate the protein **EC\_number** field for CDS entries when the
+    `--generate_encoded_features` option is passed.
+  * `function`:
+    Used to populate the **function** field for encoded entries when the
+    `--generate_encoded_features` option is passed. This is overwritten in the
+    table output by the **product** attribute if both the **Name** and
+    **product** attributes are present. See the next section for an explanation
+    of this.
+### OVERLAP BETWEEN NAME, PRODUCT AND FUNCTION FIELD
+The genbank annotation table **product** fields may contain either a short four
+letter name (e.g. pilO) or a longer gene description (e.g. pilus assembly
+protein). This presents a problem where data may need to be juggled between the
+**Name**, **product** and **function** fields depending on what information
+is available.
+Genomer view solves this problem by prioritising these fields in the following
+order: **Name** > **product** > **function**. If the **Name** attribute is
+present this will be used for the **product** field in the resulting genbank
+table. If the **product** attribute is also present at the same time this will
+instead be used to fill out the **function** field in the genbank table. If
+only the **product** and **function** attributes are present then these
+map to the corresponding fields in the genbank table.
+### RECOMMENDED FORMAT FOR ANNOTATIONS
+All entries should contain a unique `ID` attribute. A `Name` field should be used
+whenever an appropriate four letter name is also available, e.g. 'pilO'. The ID
+field alone is sufficient for generating a gene-only annotation table. Generally
+however you will want to generate the encoded annotations also using the
+`--generate_encoded_features` command line flag.
+The majority of encoded annotations will be CDS entries but most genomes will
+also contain RNA non-coding features. CDS annotations should contain either a
+`product` and/or `Name` field to match the genbank requirements. In general it
+may be easier to fill out the `product` field for all entries then add names
+for entries where possible.
+## EXAMPLES
+Assemble the scaffold sequence into Fasta format. Set the Fasta header to
+include the sequence identifier, strain, and organism.
+    $ genomer view fasta --identifier PRJNA68653 --strain='R124' \
+      --organism='Pseudomonas fluorescens'
+Assemble annotations into GenBank Table format suitable for use with `tbl2asn`.
+Reset the gene order numbering to begin at the sequence start and prefix each
+gene ID with 'I1A\_'. Set the organism identifier at the top of the feature
+table to be 'PRJNA68653'.
+    $ genomer view table --identifier PRJNA68653 --reset_locus_numbering \
+        --prefix='I1A_'
+## BUGS
+**Genomer-view** is written in Ruby and depends on the genomer gem. See the
+Gemfile in the genomer-plugin-view gem install directory for version details.
+## COPYRIGHT
+**Genomer** is Copyright (C) 2012 Michael Barton <http://michaelbarton.me.uk>