genomer-plugin-view 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/genomer-plugin-view/version", __FILE__)
3
+
4
# Gem specification for the genomer-plugin-view plugin. The version is read
# from GenomerViewPlugin::VERSION, loaded by the require at the top of the file.
Gem::Specification.new do |s|

  s.name        = "genomer-plugin-view"
  s.version     = GenomerViewPlugin::VERSION
  s.platform    = Gem::Platform::RUBY
  s.homepage    = "http://github.com/michaelbarton/genomer-plugin-view"
  s.license     = "MIT"
  s.authors     = ["Michael Barton"]
  s.email       = ["mail@michaelbarton.me.uk"]
  s.summary     = %Q{Provide different views of scaffold.}
  s.description = %Q{Convert genome scaffold into different sequence format views}

  # A pessimistic "~> 1.8.0" pin only admits RubyGems 1.8.x and therefore
  # rejects RubyGems 2.0 and later. A plain lower bound keeps the original
  # minimum while leaving the gem installable on newer RubyGems.
  s.required_rubygems_version = ">= 1.8.0"
  s.rubyforge_project         = "genomer-view-plugin"

  s.add_dependency "genomer", ">= 0.0.5"

  # Specs
  s.add_development_dependency "rspec",                   "~> 2.9.0"
  s.add_development_dependency "rr",                      "~> 1.0.4"
  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
  s.add_development_dependency "heredoc_unindent",        "~> 1.1.2"

  # Features
  s.add_development_dependency "cucumber", "~> 1.1.9"
  s.add_development_dependency "aruba",    "~> 0.4.11"

  # Ask git for the tracked files once and derive both lists from that result
  # instead of shelling out twice.
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  # Executables are the tracked files under bin/, listed without the prefix.
  s.executables  = tracked_files.map { |path| path[%r{\Abin/(.*)}, 1] }.compact
  s.require_path = 'lib'
end
@@ -0,0 +1,62 @@
1
+ require 'genomer'
2
+
3
# View plugin that renders the scaffold as an AGP listing ("##agp-version 2.0")
# of contigs and gaps.
class GenomerPluginView::Agp < Genomer::Plugin

  # Builds the AGP document.
  #
  # @return [String] the version header followed by one line per AGP entry,
  #   newline separated and newline terminated
  def run
    header = "##agp-version 2.0"
    entries.unshift(header).join("\n") + "\n"
  end

  # Finds every match of +regex+ in the upper-cased sequence.
  #
  # @param seq [String] sequence to search
  # @param regex [Regexp] pattern to scan for
  # @return [Array<Range>] 1-based, inclusive coordinates of each match
  def locations(seq, regex)
    seq.upcase.enum_for(:scan, regex).map do
      (Regexp.last_match.begin(0) + 1)..(Regexp.last_match.end(0))
    end
  end

  # Walks the scaffold and produces the AGP lines for it.
  #
  # @return [Array] one element per scaffold entry: a gap line for an
  #   :unresolved entry, a nested array of contig/gap lines for a :sequence
  #   entry, and nil for any other entry type (#run flattens these via join)
  def entries
    cumulative_length = 1 # 1-based start coordinate of the next AGP line
    count             = 0 # running AGP line number
    contigs           = 0 # running contig number

    scaffold.map do |entry|
      case entry.entry_type
      when :unresolved
        length = entry.sequence.length
        count += 1
        start = cumulative_length
        cumulative_length += length
        gap(start, cumulative_length - 1, count, 'specified')
      when :sequence
        seq = entry.sequence.upcase
        regions = locations(seq, /[^N]+/).map { |loc| [:contig, loc] } +
                  locations(seq, /N+/).map    { |loc| [:gap, loc] }

        # The regions never overlap, so ordering by start coordinate is enough.
        # Expanding each range with to_a would allocate one array element per
        # base just to compare them.
        regions.sort_by { |_, loc| loc.begin }.map do |(type, location)|
          count += 1
          # Inclusive range, so this is the region size minus one.
          length = location.end - location.begin
          line = case type
                 when :contig
                   contigs += 1
                   contig(length, cumulative_length, count, contigs)
                 when :gap
                   gap(cumulative_length, cumulative_length + length, count, 'internal')
                 end
          cumulative_length += length + 1
          line
        end
      end
    end
  end

  # Formats a contig ("W") line.
  #
  # @param length [Integer] contig size minus one
  # @param cum [Integer] start coordinate in the scaffold
  # @param count [Integer] AGP line number
  # @param no [Integer] contig number, rendered as "contigNNNNN"
  # @return [String] tab-delimited AGP line
  def contig(length, cum, count, no)
    ['scaffold', cum, cum + length, count, 'W',
     sprintf("contig%05d", no), 1, length + 1, '+'].join("\t")
  end

  # Formats a gap ("N") line.
  #
  # @param start [Integer] first coordinate of the gap
  # @param stop [Integer] last coordinate of the gap (inclusive)
  # @param count [Integer] AGP line number
  # @param type [String] value written to the final column
  # @return [String] tab-delimited AGP line
  def gap(start, stop, count, type)
    ['scaffold', start, stop, count, 'N',
     stop - start + 1, 'scaffold', 'yes', type].join("\t")
  end

end
@@ -0,0 +1,36 @@
1
+ require 'genomer'
2
+
3
# View plugin that renders the scaffold sequence as Fasta, either as a single
# record or, with the :contigs flag, as one record per run of non-N characters.
class GenomerPluginView::Fasta < Genomer::Plugin

  # Builds the Fasta output.
  #
  # @return [String] Fasta formatted text
  def run
    if flags[:contigs]
      # Remove the switch so it is not echoed by #header_flags.
      flags.delete(:contigs)

      # String#split keeps a leading empty field when the sequence starts with
      # N characters. Dropping it avoids emitting an empty "contig00001"
      # record and shifting the numbering of every real contig.
      contigs = sequence.split(/[Nn]+/).reject { |fragment| fragment.empty? }

      contigs.each_with_index.map do |fragment, index|
        name = sprintf("contig%05d", index + 1)
        Bio::Sequence.new(fragment).output(:fasta, :header => header(name))
      end.join
    else
      # #identifier runs first (argument evaluation) and removes :identifier
      # from the flags before #header_flags reads them.
      Bio::Sequence.new(sequence).output(:fasta, :header => header(identifier))
    end
  end

  # @param identifier [String] leading token of the header line
  # @return [String] identifier followed by the bracketed flags, stripped of
  #   surrounding whitespace
  def header(identifier)
    (identifier + ' ' + header_flags).strip
  end

  # Returns the :identifier flag, removing it from the flags, or '.' when the
  # flag is absent.
  #
  # @return [String]
  def identifier
    flags[:identifier] ? flags.delete(:identifier) : '.'
  end

  # @return [String] every remaining flag rendered as "[key=value]", space
  #   separated
  def header_flags
    flags.map { |k, v| "[#{k}=#{v}]" }.join(' ')
  end

  # @return [String] the sequences of all scaffold entries concatenated
  def sequence
    scaffold.map { |entry| entry.sequence }.join
  end

end
@@ -0,0 +1,61 @@
1
+ require 'bio'
2
+
3
# Mixin giving GFF3 records the ability to render themselves as tab-delimited
# GenBank feature-table entries.
module GenomerPluginView::GffRecordHelper

  # Attribute translations shared by several feature types.
  DEFAULT_GFF_MAPPING = {'product' => 'product', 'Note' => 'note' }

  # Feature type => { GFF attribute key => table qualifier name }.
  GFF_TO_TABLE = {
    'gene' => {
      'ID'   => 'locus_tag',
      'Name' => 'gene'
    },
    'CDS'  => DEFAULT_GFF_MAPPING.merge({
      'ID'        => 'protein_id',
      'ec_number' => 'EC_number',
      'function'  => 'function',
    }),
    'miscRNA' => DEFAULT_GFF_MAPPING,
    'rRNA'    => DEFAULT_GFF_MAPPING,
    'tmRNA'   => DEFAULT_GFF_MAPPING,
    'tRNA'    => DEFAULT_GFF_MAPPING
  }

  # @return [Boolean] true when the record's strand is '-'
  def negative_strand?
    self.strand == '-'
  end

  # @return [Array] [first, last, feature]; the end coordinate comes first
  #   for negative-strand records
  def coordinates
    return [self.end, self.start, self.feature] if negative_strand?

    [self.start, self.end, self.feature]
  end

  # Renders the record as a coordinates line followed by one indented line
  # per translated attribute.
  #
  # @return [String] newline-terminated table entry
  # @raise [Genomer::Error] when the feature type is unknown
  def to_genbank_table_entry
    delimiter = "\t"
    indent    = delimiter * 2

    attribute_rows = table_attributes.map { |pair| [indent] + pair }
    rows           = [coordinates] + attribute_rows

    rows.map { |row| row.join(delimiter) }.join("\n") + "\n"
  end

  # @return [Boolean] true when the feature type has an entry in GFF_TO_TABLE
  def valid?
    GFF_TO_TABLE.key?(feature)
  end

  # Translates the record's attributes into table qualifiers, dropping any
  # attribute without a translation for this feature type.
  #
  # @return [Array<Array>] [qualifier, value] pairs
  # @raise [Genomer::Error] when the feature type is unknown
  def table_attributes
    raise Genomer::Error, "Unknown feature type '#{feature}'" unless valid?

    attributes.map do |(gff_key, value)|
      qualifier = GFF_TO_TABLE[feature][gff_key]
      [qualifier, value] unless qualifier.nil?
    end.compact
  end

end

Bio::GFF::GFF3::Record.send(:include, GenomerPluginView::GffRecordHelper)
@@ -0,0 +1,14 @@
1
+ require 'genomer'
2
+
3
# View plugin that lists each annotation ID next to the ID produced once the
# command line options are applied.
class GenomerPluginView::Mapping < Genomer::Plugin

  # @return [String] one "original<TAB>updated" line per annotation, joined
  #   with newlines (no trailing newline)
  def run
    # IDs are copied before the second annotations call.
    # NOTE(review): presumably that call rewrites IDs in place - confirm
    # against Genomer::Plugin#annotations.
    original = annotations.map { |annotation| annotation.id.clone }

    options = GenomerPluginView.convert_command_line_flags(flags)
    updated = annotations(options).map { |annotation| annotation.id }

    rows = original.zip(updated).map { |pair| pair.join("\t") }
    rows.join("\n")
  end

end
@@ -0,0 +1,56 @@
1
+ require 'genomer'
2
+ require 'genomer-plugin-view/gff_record_helper'
3
+
4
# View plugin that renders the annotations as a GenBank feature table.
class GenomerPluginView::Table < Genomer::Plugin

  # Feature types that may be generated as encoded features.
  SUPPORTED_FEATURE_TYPES = ['CDS','rRNA','tRNA','miscRNA','tmRNA'].freeze

  # Builds the feature table.
  #
  # @return [String] ">Feature" header line followed by one table entry per
  #   annotation
  # @raise [Genomer::Error] for an unsupported feature type
  def run
    options = GenomerPluginView.convert_command_line_flags(flags)

    header = ">Feature\t#{options[:identifier]}\tannotation_table\n"

    attns = annotations(options)
    attns = create_encoded_features(attns, options[:encoded]) if options[:encoded]

    attns.inject(header) do |table, attn|
      table << attn.to_genbank_table_entry
    end
  end

  # Creates one encoded feature per gene and interleaves them with the genes.
  #
  # @param genes [Array] gene annotation records
  # @param prefix [String, Object] prepended to each encoded feature ID when
  #   it is a String; any other value leaves the ID unchanged
  # @return [Array] gene, encoded feature, gene, encoded feature, ...
  # @raise [Genomer::Error] when a gene names an unsupported 'feature_type'
  def create_encoded_features(genes, prefix)
    features = genes.map do |gene|
      feature = gene.clone
      attrs   = Hash[feature.attributes]

      if id = attrs['ID']
        attrs['ID'] = (prefix.is_a?(String) ? prefix + id : id)
      end

      feature.feature = attrs['feature_type'] || 'CDS'

      unless SUPPORTED_FEATURE_TYPES.include?(feature.feature)
        raise Genomer::Error, "Unknown feature_type '#{feature.feature}'"
      end

      if feature.feature == "CDS"
        name, prdt, ftn = attrs['Name'], attrs['product'], attrs['function']

        # An empty Name is treated like a missing one; indexing its first
        # character would otherwise raise NoMethodError on nil.
        if name && !name.empty?
          # Build a new string so the gene's own Name value is not modified.
          name = name[0, 1].upcase + name[1..-1]
          # The name takes the product slot and the original product moves to
          # function, replacing any function attribute.
          prdt, ftn = name, prdt
        end

        attrs.delete('Name')
        attrs['product']  = prdt
        attrs['function'] = ftn
      end

      # Attributes left without a value are omitted from the record.
      feature.attributes = attrs.to_a.reject { |(_, value)| value.nil? }
      feature
    end
    genes.zip(features).flatten
  end

end
@@ -0,0 +1,3 @@
1
# Holds the gem's release version, which the gemspec reads as
# GenomerViewPlugin::VERSION.
class GenomerViewPlugin
  # Frozen so the constant string cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,29 @@
1
+ require 'genomer'
2
+
3
# Entry point of the view plugin: dispatches to the view class named by the
# first command line argument.
class GenomerPluginView < Genomer::Plugin

  # Command line flag => option key understood by the views. Flags that are
  # not listed are dropped by .convert_command_line_flags.
  FLAG_TO_OPTION = {
    :identifier                => :identifier,
    :prefix                    => :prefix,
    :generate_encoded_features => :encoded,
    :reset_locus_numbering     => :reset
  }.freeze

  # Removes the view name from the arguments, then runs that view with the
  # remaining arguments and the flags.
  #
  # @return [Object] whatever the selected view's #run returns
  # @raise [Genomer::Error] when no view name was given
  def run
    self.class.fetch_view(arguments.shift).new(arguments, flags).run
  end

  # Loads and returns the class implementing +view+.
  #
  # @param view [String] view name, e.g. "agp"
  # @return [Class] the constant named by the capitalised view name
  # @raise [Genomer::Error] when +view+ is nil or empty
  def self.fetch_view(view)
    # Without this guard a missing argument surfaces as a TypeError from the
    # string concatenation below.
    if view.nil? || view.empty?
      raise Genomer::Error, "No view type specified"
    end

    require 'genomer-plugin-view/' + view
    const_get(view.capitalize)
  end

  # Translates command line flags into the option hash used by the views.
  #
  # @param flags [Hash] flag name => value
  # @return [Hash] option key => value, holding only the recognised flags
  def self.convert_command_line_flags(flags)
    flags.inject({}) do |options, (flag, value)|
      option = FLAG_TO_OPTION[flag]
      options[option] = value if option
      options
    end
  end

end
@@ -0,0 +1,46 @@
1
+ genomer-view-agp(1) -- Generate agp file views of scaffold
2
+ ==========================================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `genomer view agp`
7
+
8
+ ## DESCRIPTION
9
+
10
+ **Genomer-view-agp** produces an AGP view of a scaffold. This format shows the
11
+ positions of gaps and contigs in the scaffold. More details on this format can
12
+ be found on the [AGP specification page](http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/AGP_Specification.shtml "AGP Specification").
13
+
14
+ This command converts the scaffold into AGP format as follows:
15
+
16
+ * contigs:
17
+ Contiguous sequences of non-N nucleotides generate corresponding sequence
18
+ entries in the AGP file. Note: two sequences positioned next to each other
19
+ in the scaffold file do not however produce a single contig entry.
20
+
21
+ * internal contig gaps:
22
+
23
+ Regions of N characters in scaffold sequences are converted to 'scaffold' gaps
24
+ in the AGP file. The 'Linkage Evidence' field is set to "internal" and this
25
+ should be set to the correct AGP field type after generation. See the AGP
26
+ Specification for allowed evidence types.
27
+
28
+ * unresolved regions:
29
+
30
+ 'Unresolved' entries in the 'scaffold.yml' file result in a **scaffold**
31
+ gap entry in AGP file. The 'Linkage Evidence' field is set to "specified"
32
+ and this should be set to the correct AGP field type after generation. See
33
+ the AGP Specification for allowed evidence types.
34
+
35
+ ## EXAMPLES
36
+
37
+ $ genomer view agp
38
+
39
+ ## BUGS
40
+
41
+ **Genomer-view** is written in Ruby and depends on the genomer gem. See the
42
+ Gemfile in the genomer-plugin-view gem install directory for version details.
43
+
44
+ ## COPYRIGHT
45
+
46
+ **Genomer** is Copyright (C) 2012 Michael Barton <http://michaelbarton.me.uk>
@@ -0,0 +1,153 @@
1
+ genomer-view(1) -- Generate file format views of scaffold and annotations
2
+ =========================================================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `genomer view` <flatfile-type> [<options>...]
7
+
8
+ ## DESCRIPTION
9
+
10
+ **Genomer-view** assembles the scaffold and associated annotations to produce
11
+ common database file formats. The generated file format view is specified by
12
+ the **flatfile-type** argument.
13
+
14
+ ## OPTIONS
15
+
16
+ * `--identifier`=[<identifier>]:
17
+ The sequence identifier to include in generated flatfile outputs.
18
+
19
+ * `--strain`=[<strain>]:
20
+ The strain of the source organism.
21
+
22
+ * `--organism`=[<organism>]:
23
+ The genus and species, enclosed in single quotes, of the source organism.
24
+
25
+ * `--prefix`=[<gene-prefix>]:
26
+ Prepend all ID attributes from the annotation file with <gene-prefix> in
27
+ the generated output.
28
+
29
+ * `--reset_locus_numbering`:
30
+ Reset gene ID to begin at 1 from the start of the sequence in the generated
31
+ output file.
32
+
33
+ * `--generate_encoded_features`=[<feature-prefix>]:
34
+ Generate corresponding 1:1 encoded feature entries from the genes entries
35
+ in the annotation file. These will commonly be CDS entries but RNA type
36
+ entries are also supported. The feature IDs are generated from the
37
+ corresponding gene ID prefixed with the <feature-prefix>.
38
+
39
+ ## GFF NINTH COLUMN ATTRIBUTES
40
+
41
+ The annotation file should be in GFF3 format and contain the annotations for
42
+ the scaffolded contigs. The default location for this file is
43
+ **assembly/annotations.gff**. The following attributes in the GFF3 file are
44
+ treated specially by genomer when generating flat file output.
45
+
46
+ ### GFF DEFINED ATTRIBUTES
47
+
48
+ These attributes have a predefined meaning in the GFF specification. These all
49
+ begin with an upper case letter.
50
+
51
+ * `ID`:
52
+ Used to specify the ID of annotations in the output. If the
53
+ `--generate_encoded_features` option is passed, the encoded features have
54
+ an ID generated from this field prefixed with the <feature-prefix>
55
+ argument. This field should be unique in the annotation file.
56
+
57
+ * `Name`:
58
+ Used to specify the four letter annotation name, e.g. pilO. The lower case
59
+ version is used for gene names. If the `--generate_encoded_features` option
60
+ is passed, additional encoded feature entries have the `product` field
61
+ generated from the capitalised version of this attribute. This need not be
62
+ unique in the file.
63
+
64
+ * `Note`:
65
+ Used to populate the **Note** field for entries when the
66
+ `--generate_encoded_features` option is passed.
67
+
68
+ ### GENOMER ATTRIBUTES
69
+
70
+ These attributes are specific to genomer and should begin with a lower case
71
+ letter. Many of these attributes have a corresponding relationship with fields
72
+ in genbank table format, however a caveat to this is outlined in the next
73
+ section.
74
+
75
+ * `product`:
76
+ Used to populate the **product** field for encoded features when the
77
+ `--generate_encoded_features` option is passed. If the **Name** attribute
78
+ is also present then the **function** field is instead populated with this
79
+ value.
80
+
81
+ * `feature_type`:
82
+ When the gene product is not a CDS this field can be used, when the
83
+ `--generate_encoded_features` option is passed, as the corresponding entry
84
+ type instead of `CDS`. The genbank specification lists examples for `rRNA`,
85
+ `tmRNA`, `tRNA`, and `miscRNA`. If you require other feature types to be
86
+ implemented, please contact me through the website below.
87
+
88
+ * `ec_number`:
89
+ Used to populate the protein **EC\_number** field for CDS entries when the
90
+ `--generate_encoded_features` option is passed.
91
+
92
+ * `function`:
93
+ Used to populate the **function** field for encoded entries when the
94
+ `--generate_encoded_features` option is passed. This is overwritten in the
95
+ table output by the **product** attribute if both the **Name** and
96
+ **product** attributes are present. See the next section for an explanation
97
+ of this.
98
+
99
+
100
+ ### OVERLAP BETWEEN NAME, PRODUCT AND FUNCTION FIELD
101
+
102
+ The genbank annotation table **product** fields may contain either a short four
103
+ letter name (e.g. pilO) or a longer gene description (e.g. pilus assembly
104
+ protein). This presents a problem where data may need to be juggled between the
105
+ **Name**, **product** and **function** fields depending on what information
106
+ is available.
107
+
108
+ Genomer view solves this problem by prioritising these fields in the following
109
+ order: **Name** > **product** > **function**. If the **Name** attribute is
110
+ present this will be used for the **product** field in the resulting genbank
111
+ table. If the **product** attribute is also present at the same time this will
112
+ instead be used to fill out the **function** field in the genbank table. If
113
+ only the **product** and **function** attributes are present then these
114
+ map to corresponding fields in genbank table.
115
+
116
+ ### RECOMMENDED FORMAT FOR ANNOTATIONS
117
+
118
+ All entries should contain a unique `ID` attribute. A `Name` field should be used
119
+ whenever an appropriate four letter name is also available, e.g. 'pilO'. The ID
120
+ field alone is sufficient for generating a gene-only annotation table. Generally
121
+ however you will want to generate the encoded annotations also using the
122
+ `--generate_encoded_features` command line flag.
123
+
124
+ The majority of encoded annotations will be CDS entries but most genomes will
125
+ also contain RNA non-coding features. CDS annotations should contain either a
126
+ `product` and/or `Name` field to match the genbank requirements. In general it
127
+ may be easier to fill out the `product` field for all entries, then add names
128
+ for entries where possible.
129
+
130
+ ## EXAMPLES
131
+
132
+ Assemble the scaffold sequence into Fasta format. Set the Fasta header to
133
+ include the sequence identifier, strain, and organism.
134
+
135
+ $ genomer view fasta --identifier PRJNA68653 --strain='R124' \
136
+ --organism='Pseudomonas fluorescens'
137
+
138
+ Assemble annotations into GenBank Table format suitable for use with `tbl2asn`.
139
+ Reset the gene order numbering to begin at the sequence start and prefix each
140
+ gene ID with 'I1A\_'. Set the organism identifier at the top of the feature
141
+ table to be 'PRJNA68653'.
142
+
143
+ $ genomer view table --identifier PRJNA68653 --reset_locus_numbering \
144
+ --prefix='I1A_'
145
+
146
+ ## BUGS
147
+
148
+ **Genomer-view** is written in Ruby and depends on the genomer gem. See the
149
+ Gemfile in the genomer-plugin-view gem install directory for version details.
150
+
151
+ ## COPYRIGHT
152
+
153
+ **Genomer** is Copyright (C) 2012 Michael Barton <http://michaelbarton.me.uk>