genomer-plugin-view 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,34 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/genomer-plugin-view/version", __FILE__)
3
+
4
# Gem packaging metadata for genomer-plugin-view.
Gem::Specification.new do |s|

  s.name        = "genomer-plugin-view"
  s.version     = GenomerViewPlugin::VERSION
  s.platform    = Gem::Platform::RUBY
  s.homepage    = "http://github.com/michaelbarton/genomer-plugin-view"
  s.license     = "MIT"
  s.authors     = ["Michael Barton"]
  s.email       = ["mail@michaelbarton.me.uk"]
  s.summary     = %Q{Provide different views of scaffold.}
  s.description = %Q{Convert genome scaffold into different sequence format views}

  # A pessimistic "~> 1.8.0" constraint would only accept RubyGems 1.8.x and
  # refuse every later RubyGems release, so only a minimum version is required.
  s.required_rubygems_version = ">= 1.8.0"
  s.rubyforge_project         = "genomer-view-plugin"

  s.add_dependency "genomer", ">= 0.0.5"

  # Specs
  s.add_development_dependency "rspec",                   "~> 2.9.0"
  s.add_development_dependency "rr",                      "~> 1.0.4"
  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
  s.add_development_dependency "heredoc_unindent",        "~> 1.1.2"

  # Features
  s.add_development_dependency "cucumber", "~> 1.1.9"
  s.add_development_dependency "aruba",    "~> 0.4.11"

  # Ask git for the tracked files once and derive both lists from the result.
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  # Executables are the tracked files under bin/, listed relative to bin/.
  s.executables  = tracked_files.map { |f| f[%r{^bin/(.*)}, 1] }.compact
  s.require_path = 'lib'
end
@@ -0,0 +1,62 @@
1
+ require 'genomer'
2
+
3
# Renders the scaffold as an AGP (version 2.0) file.
class GenomerPluginView::Agp < Genomer::Plugin

  # Builds the complete AGP document.
  #
  # @return [String] the version header followed by one tab-delimited line per
  #   AGP entry, terminated by a newline
  def run
    header = "##agp-version 2.0"
    # `entries` may contain nested arrays (one per scaffold sequence);
    # Array#join flattens them while joining.
    entries.unshift(header).join("\n") + "\n"
  end

  # Finds every region of +seq+ matched by +regex+.
  #
  # @param seq [String] nucleotide sequence (upcased before scanning)
  # @param regex [Regexp] pattern describing the regions of interest
  # @return [Array<Range>] 1-based, inclusive position ranges of each match
  def locations(seq,regex)
    seq.upcase.enum_for(:scan, regex).map do
      (Regexp.last_match.begin(0)+1)..(Regexp.last_match.end(0))
    end
  end

  # Walks the scaffold and produces the AGP lines for each entry.
  #
  # :unresolved entries become a single gap with 'specified' evidence.
  # :sequence entries are split into runs of non-N characters (contigs) and
  # runs of N characters (gaps with 'internal' evidence). Any other entry type
  # yields nil, which joins as an empty string in #run.
  #
  # @return [Array] AGP lines; :sequence entries contribute a nested array
  def entries
    cumulative_length = 1   # 1-based scaffold coordinate of the next entry
    count             = 0   # running part number across all AGP lines
    contigs           = 0   # running contig number used for contig names

    scaffold.map do |entry|
      case entry.entry_type
      when :unresolved
        length = entry.sequence.length
        count += 1
        start  = cumulative_length
        cumulative_length += length
        gap(start, cumulative_length - 1, count, 'specified')
      when :sequence
        seq         = entry.sequence.upcase
        seq_regions = locations(seq,/[^N]+/).map{|loc| [:contig,loc]}
        gap_regions = locations(seq,/N+/).map{|loc| [:gap,loc]}

        # Contig and gap regions never overlap, so ordering by start position
        # is sufficient and avoids expanding each range into an array of all
        # of its positions.
        regions = (seq_regions + gap_regions).sort_by{|_,loc| loc.begin}

        regions.map do |(type,location)|
          count += 1
          # Distance between the inclusive ends, i.e. region size minus one.
          length = (location.end - location.begin)
          line = case type
                 when :contig
                   contigs += 1
                   contig(length, cumulative_length, count, contigs)
                 when :gap
                   gap(cumulative_length, cumulative_length + length, count, 'internal')
                 end
          cumulative_length += length + 1
          line
        end
      end
    end
  end

  # Formats a contig ("W") line.
  #
  # @param length [Integer] contig size minus one (end - start)
  # @param cum [Integer] scaffold start coordinate of the contig
  # @param count [Integer] part number of this line
  # @param no [Integer] contig number, rendered as "contig%05d"
  # @return [String] tab-delimited AGP line
  def contig(length, cum, count, no)
    %W|scaffold #{cum} #{cum + length} #{count} W #{sprintf("contig%05d",no)} 1 #{length+1} +| * "\t"
  end

  # Formats a gap ("N") line.
  #
  # @param start [Integer] first scaffold coordinate of the gap
  # @param stop [Integer] last scaffold coordinate of the gap (inclusive)
  # @param count [Integer] part number of this line
  # @param type [String] value written to the final (linkage evidence) column
  # @return [String] tab-delimited AGP line
  def gap(start, stop, count, type)
    %W|scaffold #{start} #{stop} #{count} N #{stop - start + 1} scaffold yes #{type}| * "\t"
  end

end
@@ -0,0 +1,36 @@
1
+ require 'genomer'
2
+
3
# Renders the scaffold sequence as Fasta, either as one record or as one
# record per contig.
class GenomerPluginView::Fasta < Genomer::Plugin

  # Builds the Fasta output.
  #
  # With the :contigs flag the scaffold sequence is split on runs of N/n and
  # each piece is written as its own record named "contigNNNNN". Otherwise the
  # whole sequence is written as a single record named by #identifier.
  #
  # @return [String] Fasta formatted text
  def run
    if flags[:contigs]
      # Remove the flag so it is not echoed into the headers by #header_flags.
      flags.delete(:contigs)

      sequence.
        split(/[Nn]+/).
        # A scaffold that begins with N makes split return a leading empty
        # string; drop it so no empty contig record is emitted and numbering
        # starts at the first real contig.
        reject{|s| s.empty? }.
        map{|s| Bio::Sequence.new(s) }.
        each_with_index.
        map{|s,i| s.output(:fasta,:header => header(sprintf("contig%05d",i+1))) }.
        join
    else
      Bio::Sequence.new(sequence).output(:fasta,:header => header(identifier))
    end
  end

  # Builds a Fasta header line from an identifier and the remaining flags.
  #
  # @param identifier [String] record name placed at the start of the header
  # @return [String] the identifier followed by "[key=value]" flag pairs
  def header(identifier)
    (identifier + ' ' + header_flags).strip
  end

  # Record name for the single-record output.
  #
  # Removes :identifier from the flags when present so that #header_flags does
  # not repeat it; falls back to '.' when no identifier was given.
  #
  # @return [String]
  def identifier
    flags[:identifier] ? flags.delete(:identifier) : '.'
  end

  # @return [String] every remaining flag rendered as "[key=value]", separated
  #   by single spaces
  def header_flags
    flags.map{|k,v| "[#{k}=#{v}]" }.join(' ')
  end

  # @return [String] the sequences of all scaffold entries concatenated
  def sequence
    scaffold.map{|entry| entry.sequence}.join
  end

end
@@ -0,0 +1,61 @@
1
+ require 'bio'
2
+
3
# Helpers mixed into GFF3 records so they can be rendered as GenBank feature
# table entries.
module GenomerPluginView::GffRecordHelper

  # GFF attribute name => table qualifier shared by most feature types.
  DEFAULT_GFF_MAPPING = {'product' => 'product', 'Note' => 'note' }

  # Feature type => (GFF attribute name => table qualifier).
  GFF_TO_TABLE = {
    'gene' => {
      'ID'   => 'locus_tag',
      'Name' => 'gene'
    },
    'CDS'  => DEFAULT_GFF_MAPPING.merge({
      'ID'        => 'protein_id',
      'ec_number' => 'EC_number',
      'function'  => 'function',
    }),
    'miscRNA' => DEFAULT_GFF_MAPPING,
    'rRNA'    => DEFAULT_GFF_MAPPING,
    'tmRNA'   => DEFAULT_GFF_MAPPING,
    'tRNA'    => DEFAULT_GFF_MAPPING
  }

  # @return [Boolean] true when the record's strand is '-'
  def negative_strand?
    self.strand == '-'
  end

  # First row of a table entry: the two coordinates followed by the feature
  # type. The coordinates are swapped for records on the negative strand.
  #
  # @return [Array] [first, last, feature]
  def coordinates
    first, last = negative_strand? ? [self.end, self.start] : [self.start, self.end]
    [first, last, self.feature]
  end

  # Renders the record as a tab-delimited table entry: the coordinate row
  # followed by one row per mapped attribute, each led by two extra tabs.
  #
  # @return [String] newline-terminated table entry
  # @raise [Genomer::Error] if the feature type is not in GFF_TO_TABLE
  def to_genbank_table_entry
    delimiter = "\t"
    indent    = delimiter * 2

    rows = [coordinates]
    table_attributes.each do |qualifier|
      rows << ([indent] + qualifier)
    end

    rows.map{|row| row.join(delimiter) }.join("\n") + "\n"
  end

  # @return [Boolean] true when the feature type has a mapping in GFF_TO_TABLE
  def valid?
    GFF_TO_TABLE.key?(feature)
  end

  # Translates the record's attributes into table qualifiers, dropping any
  # attribute without a mapping for this feature type.
  #
  # @return [Array<Array>] [qualifier, value] pairs in attribute order
  # @raise [Genomer::Error] if the feature type is not in GFF_TO_TABLE
  def table_attributes
    unless valid?
      raise Genomer::Error, "Unknown feature type '#{feature}'"
    end

    attributes.inject([]) do |pairs,(name,value)|
      qualifier = GFF_TO_TABLE[feature][name]
      pairs << [qualifier,value] unless qualifier.nil?
      pairs
    end
  end

end

# Make the helpers available on every BioRuby GFF3 record.
Bio::GFF::GFF3::Record.send(:include, GenomerPluginView::GffRecordHelper)
@@ -0,0 +1,14 @@
1
+ require 'genomer'
2
+
3
# Lists each annotation ID alongside the ID it receives once the command line
# options have been applied.
class GenomerPluginView::Mapping < Genomer::Plugin

  # @return [String] one "original<TAB>updated" line per annotation, joined by
  #   newlines with no trailing newline
  def run
    # IDs are copied before the second annotations call.
    # NOTE(review): presumably because that call may modify the ID strings in
    # place -- confirm against Genomer::Plugin#annotations.
    before  = annotations.map{|annotation| annotation.id.clone }
    options = GenomerPluginView.convert_command_line_flags(flags)
    after   = annotations(options).map{|annotation| annotation.id }

    rows = before.zip(after).map{|pair| pair.join("\t") }
    rows.join("\n")
  end

end
@@ -0,0 +1,56 @@
1
+ require 'genomer'
2
+ require 'genomer-plugin-view/gff_record_helper'
3
+
4
# Renders the annotations as a GenBank feature table.
class GenomerPluginView::Table < Genomer::Plugin

  # Feature types that may be generated for the product encoded by a gene.
  SUPPORTED_FEATURE_TYPES = ['CDS','rRNA','tRNA','miscRNA','tmRNA']

  # @return [String] the ">Feature" header line followed by the table entry of
  #   every annotation
  def run
    options = GenomerPluginView.convert_command_line_flags(flags)

    table   = ">Feature\t#{options[:identifier]}\tannotation_table\n"
    records = annotations(options)

    if options[:encoded]
      records = create_encoded_features(records, options[:encoded])
    end

    records.each{|record| table << record.to_genbank_table_entry }
    table
  end

  # Creates one encoded feature (CDS by default) for every gene and
  # interleaves them with the genes.
  #
  # @param genes [Array] gene annotation records
  # @param prefix [String, Object] prepended to each feature ID when it is a
  #   String; any other value leaves the ID unchanged
  # @return [Array] gene, feature, gene, feature, ...
  # @raise [Genomer::Error] if a gene's 'feature_type' attribute is not one of
  #   SUPPORTED_FEATURE_TYPES
  def create_encoded_features(genes,prefix)
    encoded = genes.map do |gene|
      feature = gene.clone
      attrs   = Hash[feature.attributes]

      id = attrs['ID']
      attrs['ID'] = prefix + id if id && prefix.is_a?(String)

      feature.feature = attrs['feature_type'] || 'CDS'

      unless SUPPORTED_FEATURE_TYPES.include?(feature.feature)
        raise Genomer::Error, "Unknown feature_type '#{feature.feature}'"
      end

      if feature.feature == "CDS"
        name     = attrs.delete('Name')
        product  = attrs['product']
        function = attrs['function']

        # A name takes over the product field (first letter upper-cased) and
        # pushes the original product into the function field.
        if name
          capitalised    = name.clone
          capitalised[0] = capitalised[0].upcase
          product, function = capitalised, product
        end

        attrs['product']  = product
        attrs['function'] = function
      end

      # Attributes without a value are left out of the generated feature.
      feature.attributes = attrs.to_a.select{|(_,value)| !value.nil? }
      feature
    end

    genes.zip(encoded).flatten
  end

end
@@ -0,0 +1,3 @@
1
# Holds the gem version read by the gemspec.
class GenomerViewPlugin
  # Current release of genomer-plugin-view.
  VERSION = '0.0.2'
end
@@ -0,0 +1,29 @@
1
+ require 'genomer'
2
+
3
# Entry point for `genomer view`: dispatches to the view named by the first
# command line argument.
class GenomerPluginView < Genomer::Plugin

  # Runs the requested view with the remaining arguments and the flags.
  #
  # @return [Object] whatever the selected view's #run returns
  def run
    view_class = self.class.fetch_view(arguments.shift)
    view_class.new(arguments,flags).run
  end

  # Loads the file for a view and returns its class.
  #
  # @param view [String] view name, e.g. "agp"
  # @return [Class] the constant named by the capitalised view name
  def self.fetch_view(view)
    path = 'genomer-plugin-view/' + view
    require path
    const_get(view.capitalize)
  end

  # Translates command line flags into option names, discarding every flag
  # that has no translation.
  #
  # @param flags [Hash] command line flags keyed by symbol
  # @return [Hash] options keyed by :identifier, :prefix, :encoded and :reset
  def self.convert_command_line_flags(flags)
    option_names = {
      :identifier                => :identifier,
      :prefix                    => :prefix,
      :generate_encoded_features => :encoded,
      :reset_locus_numbering     => :reset
    }

    flags.inject(Hash.new) do |options,(flag,value)|
      option = option_names[flag]
      options[option] = value if option
      options
    end
  end

end
@@ -0,0 +1,46 @@
1
+ genomer-view-agp(1) -- Generate agp file views of scaffold
2
+ ==========================================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `genomer view agp`
7
+
8
+ ## DESCRIPTION
9
+
10
+ **Genomer-view-agp** produces an AGP view of a scaffold. This format shows the
11
+ positions of gaps and contigs in the scaffold. More details on this format can
12
+ be found on the [AGP specification page](http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/AGP_Specification.shtml "AGP Specification").
13
+
14
+ This command converts the scaffold into AGP format as follows:
15
+
16
+ * contigs:
17
+ Contiguous sequences of non-N nucleotides generate corresponding sequence
18
+ entries in the AGP file. Note: two sequences positioned next to each other
19
+ in the scaffold file do not however produce a single contig entry.
20
+
21
+ * internal contig gaps:
22
+
23
+ Regions of N characters in scaffold sequences are converted to 'scaffold' gaps
24
+ in the AGP file. The 'Linkage Evidence' field is set to "internal" and this
25
+ should be set to the correct AGP field type after generation. See the AGP
26
+ Specification for allowed evidence types.
27
+
28
+ * unresolved regions:
29
+
30
+ 'Unresolved' entries in the 'scaffold.yml' file result in a **scaffold**
31
+ gap entry in the AGP file. The 'Linkage Evidence' field is set to "specified"
32
+ and this should be set to the correct AGP field type after generation. See
33
+ the AGP Specification for allowed evidence types.
34
+
35
+ ## EXAMPLES
36
+
37
+ $ genomer view agp
38
+
39
+ ## BUGS
40
+
41
+ **Genomer-view** is written in Ruby and depends on the genomer gem. See the
42
+ Gemfile in the genomer-plugin-view gem install directory for version details.
43
+
44
+ ## COPYRIGHT
45
+
46
+ **Genomer** is Copyright (C) 2012 Michael Barton <http://michaelbarton.me.uk>
@@ -0,0 +1,153 @@
1
+ genomer-view(1) -- Generate file format views of scaffold and annotations
2
+ =========================================================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `genomer view` <flatfile-type> [<options>...]
7
+
8
+ ## DESCRIPTION
9
+
10
+ **Genomer-view** assembles the scaffold and associated annotations to produce
11
+ common database file formats. The generated file format view is specified by
12
+ the **flatfile-type** argument.
13
+
14
+ ## OPTIONS
15
+
16
+ * `--identifier`=[<identifier>]:
17
+ The sequence identifier to include in generated flatfile outputs.
18
+
19
+ * `--strain`=[<strain>]:
20
+ The strain of the source organism.
21
+
22
+ * `--organism`=[<organism>]:
23
+ The genus and species, enclosed in single quotes, of the source organism.
24
+
25
+ * `--prefix`=[<gene-prefix>]:
26
+ Prepend all ID attributes from the annotation file with <gene-prefix> in
27
+ the generated output.
28
+
29
+ * `--reset_locus_numbering`:
30
+ Reset gene ID to begin at 1 from the start of the sequence in the generated
31
+ output file.
32
+
33
+ * `--generate_encoded_features`=[<feature-prefix>]:
34
+ Generate corresponding 1:1 encoded feature entries from the genes entries
35
+ in the annotation file. These will commonly be CDS entries but RNA type
36
+ entries are also supported. The feature IDs are generated from the
37
+ corresponding gene ID prefixed with the <feature-prefix>.
38
+
39
+ ## GFF NINTH COLUMN ATTRIBUTES
40
+
41
+ The annotation file should be in GFF3 format and contain the annotations for
42
+ the scaffolded contigs. The default location for this file is
43
+ **assembly/annotations.gff**. The following attributes in the GFF3 file are
44
+ treated specially by genomer when generating flat file output.
45
+
46
+ ### GFF DEFINED ATTRIBUTES
47
+
48
+ These attributes have a predefined meaning in the GFF specification. These all
49
+ begin with an upper case letter.
50
+
51
+ * `ID`:
52
+ Used to specify the ID of annotations in the output. If the
53
+ `--generate_encoded_features` option is passed, the encoded features have
54
+ an ID generated from this field prefixed with the <feature-prefix>
55
+ argument. This field should be unique in the annotation file.
56
+
57
+ * `Name`:
58
+ Used to specify the four letter annotation name, e.g. pilO. The lower case
59
+ version is used for gene names. If the `--generate_encoded_features` option
60
+ is passed, additional encoded feature entries have the `product` field
61
+ generated from the capitalised version of this attribute. This need not be
62
+ unique in the file.
63
+
64
+ * `Note`:
65
+ Used to populate the **Note** field for entries when the
66
+ `--generate_encoded_features` option is passed.
67
+
68
+ ### GENOMER ATTRIBUTES
69
+
70
+ These attributes are specific to genomer and should begin with a lower case
71
+ letter. Many of these attributes have a corresponding relationship with fields
72
+ in genbank table format, however a caveat to this is outlined in the next
73
+ section.
74
+
75
+ * `product`:
76
+ Used to populate the **product** field for encoded features when the
77
+ `--generate_encoded_features` option is passed. If the **Name** attribute
78
+ is also present then the **function** field is instead populated with this
79
+ value.
80
+
81
+ * `feature_type`:
82
+ When the gene product is not a CDS this field can be used, when the
83
+ `--generate_encoded_features` option is passed, as the corresponding entry
84
+ type instead of `CDS`. The genbank specification lists examples for `rRNA`,
85
+ `tmRNA`, `tRNA`, and `miscRNA`. If you require another feature type
86
+ implemented, please contact me through the website below.
87
+
88
+ * `ec_number`:
89
+ Used to populate the protein **EC\_number** field for CDS entries when the
90
+ `--generate_encoded_features` option is passed.
91
+
92
+ * `function`:
93
+ Used to populate the **function** field for encoded entries when the
94
+ `--generate_encoded_features` option is passed. This is overwritten in the
95
+ table output by the **product** attribute if both the **Name** and
96
+ **product** attributes are present. See the next section for an explanation
97
+ of this.
98
+
99
+
100
+ ### OVERLAP BETWEEN NAME, PRODUCT AND FUNCTION FIELD
101
+
102
+ The genbank annotation table **product** fields may contain either a short four
103
+ letter name (e.g. pilO) or a longer gene description (e.g. pilus assembly
104
+ protein). This presents a problem where data may need to be juggled between the
105
+ **Name**, **product** and **function** fields depending on what information
106
+ is available.
107
+
108
+ Genomer view solves this problem by prioritising these fields in the following
109
+ order: **Name** > **product** > **function**. If the **Name** attribute is
110
+ present this will be used for the **product** field in the resulting genbank
111
+ table. If the **product** attribute is also present at the same time this will
112
+ instead be used to fill out the **function** field in the genbank table. If
113
+ only the **product** and **function** attributes are present then these
114
+ map to corresponding fields in genbank table.
115
+
116
+ ### RECOMMENDED FORMAT FOR ANNOTATIONS
117
+
118
+ All entries should contain a unique `ID` attribute. A `Name` field should be used
119
+ whenever an appropriate four letter name is also available, e.g. 'pilO'. The ID
120
+ field alone is sufficient for generating a gene-only annotation table. Generally
121
+ however you will want to generate the encoded annotations also using the
122
+ `--generate_encoded_features` command line flag.
123
+
124
+ The majority of encoded annotations will be CDS entries but most genomes will
125
+ also contain RNA non-coding features. CDS annotations should contain either a
126
+ `product` and/or `Name` field to match the genbank requirements. In general it
127
+ may be easier to fill out the `product` field for all entries, then add names
128
+ for entries where possible.
129
+
130
+ ## EXAMPLES
131
+
132
+ Assemble the scaffold sequence into Fasta format. Set the Fasta header to
133
+ include the sequence identifier, strain, and organism.
134
+
135
+ $ genomer view fasta --identifier PRJNA68653 --strain='R124' \
136
+ --organism='Pseudomonas fluorescens'
137
+
138
+ Assemble annotations into GenBank Table format suitable for use with `tbl2asn`.
139
+ Reset the gene order numbering to begin at the sequence start and prefix each
140
+ gene ID with 'I1A\_'. Set the organism identifier at the top of the feature
141
+ table to be 'PRJNA68653'.
142
+
143
+ $ genomer view table --identifier PRJNA68653 --reset_locus_numbering \
144
+ --prefix='I1A_'
145
+
146
+ ## BUGS
147
+
148
+ **Genomer-view** is written in Ruby and depends on the genomer gem. See the
149
+ Gemfile in the genomer-plugin-view gem install directory for version details.
150
+
151
+ ## COPYRIGHT
152
+
153
+ **Genomer** is Copyright (C) 2012 Michael Barton <http://michaelbarton.me.uk>