genomer-plugin-validate 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +9 -0
- data/VERSION +1 -0
- data/features/annotations/bad-product-field.feature +91 -0
- data/features/annotations/command-line-interface.feature +19 -0
- data/features/annotations/duplicate_id.feature +144 -0
- data/features/annotations/identical_locations.feature +74 -0
- data/features/annotations/incorrect-attributes.feature +135 -0
- data/features/annotations/missing_attributes.feature +75 -0
- data/features/annotations/name.feature +40 -0
- data/features/command-line-interface.feature +36 -0
- data/features/support/env.rb +13 -0
- data/genomer-plugin-validate.gemspec +28 -0
- data/lib/extensions/string.rb +12 -0
- data/lib/genomer-plugin-validate.rb +33 -0
- data/lib/genomer-plugin-validate/group.rb +17 -0
- data/lib/genomer-plugin-validate/group/annotations.rb +20 -0
- data/lib/genomer-plugin-validate/validator.rb +27 -0
- data/lib/genomer-plugin-validate/validator/bad_product_field.rb +45 -0
- data/lib/genomer-plugin-validate/validator/duplicate_coordinates.rb +12 -0
- data/lib/genomer-plugin-validate/validator/duplicate_id.rb +11 -0
- data/lib/genomer-plugin-validate/validator/gff3_attributes.rb +16 -0
- data/lib/genomer-plugin-validate/validator/missing_id.rb +13 -0
- data/lib/genomer-plugin-validate/validator/no_name_or_product.rb +13 -0
- data/lib/genomer-plugin-validate/validator/uppercase_name.rb +13 -0
- data/lib/genomer-plugin-validate/validator/view_attributes.rb +16 -0
- data/man/genomer-validate.ronn +100 -0
- data/spec/genomer-plugin-validate/group/annotations_spec.rb +18 -0
- data/spec/genomer-plugin-validate/group_spec.rb +24 -0
- data/spec/genomer-plugin-validate/validator/bad_product_field_spec.rb +93 -0
- data/spec/genomer-plugin-validate/validator/duplicate_coordinates_spec.rb +24 -0
- data/spec/genomer-plugin-validate/validator/duplicate_id_spec.rb +34 -0
- data/spec/genomer-plugin-validate/validator/gff_attributes_spec.rb +32 -0
- data/spec/genomer-plugin-validate/validator/missing_id_spec.rb +27 -0
- data/spec/genomer-plugin-validate/validator/no_name_or_product_spec.rb +28 -0
- data/spec/genomer-plugin-validate/validator/uppercase_name_spec.rb +22 -0
- data/spec/genomer-plugin-validate/validator/view_attributes_spec.rb +31 -0
- data/spec/genomer-plugin-validate/validator_spec.rb +107 -0
- data/spec/genomer-plugin-validate_spec.rb +92 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/validator_run_matcher.rb +25 -0
- metadata +244 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
Feature: Validating annotation files for missing attributes
|
2
|
+
In order to submit genome annotations
|
3
|
+
A user can use the "annotations" command to detect missing ID, Name, or product attributes
|
4
|
+
to ensure that their annotation file contains no errors
|
5
|
+
|
6
|
+
@disable-bundler
|
7
|
+
Scenario: Validating an annotations file with a missing ID attribute
|
8
|
+
Given I successfully run `genomer init project`
|
9
|
+
And I cd to "project"
|
10
|
+
And I write to "assembly/scaffold.yml" with:
|
11
|
+
"""
|
12
|
+
---
|
13
|
+
- sequence:
|
14
|
+
source: contig1
|
15
|
+
"""
|
16
|
+
And I write to "assembly/sequence.fna" with:
|
17
|
+
"""
|
18
|
+
>contig1
|
19
|
+
AAAAATTTTTGGGGGCCCCC
|
20
|
+
"""
|
21
|
+
And I write to "assembly/annotations.gff" with:
|
22
|
+
"""
|
23
|
+
##gff-version 3
|
24
|
+
contig1 . gene 1 3 . + 1 ID=gene1;Name=something
|
25
|
+
contig1 . gene 4 6 . + 1 Name=something
|
26
|
+
"""
|
27
|
+
And I append to "Gemfile" with:
|
28
|
+
"""
|
29
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
30
|
+
"""
|
31
|
+
When I run `genomer validate annotations`
|
32
|
+
Then the exit status should be 0
|
33
|
+
And the output should contain:
|
34
|
+
"""
|
35
|
+
Annotations found with missing ID attribute
|
36
|
+
|
37
|
+
"""
|
38
|
+
|
39
|
+
@disable-bundler
|
40
|
+
Scenario: Validating an annotations file with a missing Name or product attributes
|
41
|
+
Given I successfully run `genomer init project`
|
42
|
+
And I cd to "project"
|
43
|
+
And I write to "assembly/scaffold.yml" with:
|
44
|
+
"""
|
45
|
+
---
|
46
|
+
- sequence:
|
47
|
+
source: contig1
|
48
|
+
"""
|
49
|
+
And I write to "assembly/sequence.fna" with:
|
50
|
+
"""
|
51
|
+
>contig1
|
52
|
+
AAAAATTTTTGGGGGCCCCC
|
53
|
+
"""
|
54
|
+
And I write to "assembly/annotations.gff" with:
|
55
|
+
"""
|
56
|
+
##gff-version 3
|
57
|
+
contig1 . gene 1 2 . + 1 ID=gene1;Name=something
|
58
|
+
contig1 . gene 3 4 . + 1 ID=gene2;product=something
|
59
|
+
contig1 . gene 5 6 . + 1 ID=gene3;product=something;Name=else
|
60
|
+
contig1 . gene 7 8 . + 1 ID=gene4
|
61
|
+
"""
|
62
|
+
And I append to "Gemfile" with:
|
63
|
+
"""
|
64
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
65
|
+
"""
|
66
|
+
When I run `genomer validate annotations`
|
67
|
+
Then the exit status should be 0
|
68
|
+
And the output should not contain "gene1"
|
69
|
+
And the output should not contain "gene2"
|
70
|
+
And the output should not contain "gene3"
|
71
|
+
And the output should contain:
|
72
|
+
"""
|
73
|
+
No 'Name' or 'product' attribute for annotation 'gene4'
|
74
|
+
"""
|
75
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: Validating annotation files for incorrect names
|
2
|
+
In order to submit genome annotations
|
3
|
+
A user can use the "annotations" command to detect capitalised gene names
|
4
|
+
to ensure that their annotation file contains none of these
|
5
|
+
|
6
|
+
@disable-bundler
|
7
|
+
Scenario: Validating an annotations file with an uppercase name attribute
|
8
|
+
Given I successfully run `genomer init project`
|
9
|
+
And I cd to "project"
|
10
|
+
And I write to "assembly/scaffold.yml" with:
|
11
|
+
"""
|
12
|
+
---
|
13
|
+
- sequence:
|
14
|
+
source: contig1
|
15
|
+
"""
|
16
|
+
And I write to "assembly/sequence.fna" with:
|
17
|
+
"""
|
18
|
+
>contig1
|
19
|
+
AAAAATTTTTGGGGGCCCCC
|
20
|
+
"""
|
21
|
+
And I write to "assembly/annotations.gff" with:
|
22
|
+
"""
|
23
|
+
##gff-version 3
|
24
|
+
contig1 . gene 1 3 . + 1 Name=Uppercase;ID=1
|
25
|
+
contig1 . gene 4 6 . + 1 Name=lowercase;ID=2
|
26
|
+
"""
|
27
|
+
And I append to "Gemfile" with:
|
28
|
+
"""
|
29
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
30
|
+
"""
|
31
|
+
When I run `genomer validate annotations`
|
32
|
+
Then the exit status should be 0
|
33
|
+
And the output should contain:
|
34
|
+
"""
|
35
|
+
Illegal capitalised Name attribute 'Uppercase' for '1'
|
36
|
+
"""
|
37
|
+
And the output should not contain:
|
38
|
+
"""
|
39
|
+
Illegal capitalised Name attribute 'lowercase' for '2'
|
40
|
+
"""
|
@@ -0,0 +1,36 @@
|
|
1
|
+
Feature: The validator command line interface
|
2
|
+
In order to generate correct genomer builds
|
3
|
+
A user can use the "validate" plugin at the command line
|
4
|
+
to validate their genome build
|
5
|
+
|
6
|
+
@disable-bundler
|
7
|
+
Scenario: Running with just the 'validate' command
|
8
|
+
Given I successfully run `genomer init project`
|
9
|
+
And I cd to "project"
|
10
|
+
And I append to "Gemfile" with:
|
11
|
+
"""
|
12
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
13
|
+
"""
|
14
|
+
When I run `genomer validate`
|
15
|
+
Then the exit status should be 0
|
16
|
+
And the output should contain:
|
17
|
+
"""
|
18
|
+
USAGE: genomer validate <GROUP>
|
19
|
+
|
20
|
+
Available validation groups:
|
21
|
+
"""
|
22
|
+
|
23
|
+
@disable-bundler
|
24
|
+
Scenario: Running with an unknown validation group
|
25
|
+
Given I successfully run `genomer init project`
|
26
|
+
And I cd to "project"
|
27
|
+
And I append to "Gemfile" with:
|
28
|
+
"""
|
29
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
30
|
+
"""
|
31
|
+
When I run `genomer validate unknown`
|
32
|
+
Then the exit status should be 1
|
33
|
+
And the output should contain:
|
34
|
+
"""
|
35
|
+
Error. Unknown validation group 'unknown'
|
36
|
+
"""
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Cucumber support script: activates the Bundler environment, extends the
# load path with the lib and spec directories two levels above this file,
# and finally loads Aruba's Cucumber integration.
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  # Report the Bundler failure and exit with the status code it carries.
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

# lib is unshifted first and spec second, so spec ends up at the very front
# of the load path.
%w[lib spec].each do |dir|
  $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../../#{dir}")
end

require 'aruba/cucumber'
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for the genomer-plugin-validate plugin.
Gem::Specification.new do |s|
  s.name        = "genomer-plugin-validate"
  # Read VERSION relative to this gemspec instead of the current working
  # directory, and strip the trailing newline from the file contents.
  s.version     = File.read(File.expand_path('../VERSION', __FILE__)).strip
  s.authors     = ["Michael Barton"]
  s.email       = ["mail@michaelbarton.me.uk"]
  s.homepage    = ""
  s.summary     = %q{Validate assembly files for errors}
  s.description = %q{Test assembly files for common errors which may lead to incorrect assembly}

  s.rubyforge_project = "genomer-plugin-validate"

  # The packaged file lists are taken from what git tracks; these commands
  # run in whatever directory the gemspec is evaluated from.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency "genomer",          ">= 0.0.4"
  s.add_runtime_dependency "heredoc_unindent", "~> 1.1.0"

  s.add_development_dependency "rake"
  s.add_development_dependency "rspec",                   "~> 2.8.0"
  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
  s.add_development_dependency "cucumber",                "~> 1.1.4"
  s.add_development_dependency "aruba",                   "~> 0.4.11"
  s.add_development_dependency "rr",                      "~> 1.0.4"
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require "genomer"
|
2
|
+
require "heredoc_unindent"
|
3
|
+
|
4
|
+
# Genomer plugin behind the `genomer validate <GROUP>` command: looks up the
# requested validation group and runs each of its validators.
class GenomerPluginValidate < Genomer::Plugin
  # Required inside the class body because these files define constants
  # nested under GenomerPluginValidate, which therefore has to exist first.
  require 'genomer-plugin-validate/validator'
  require 'genomer-plugin-validate/group'

  # Runs the validation group named by the first command line argument.
  #
  # Returns the usage message when no group name was given; otherwise the
  # messages produced by the group's validators, joined with newlines.
  # Raises Genomer::Error when the group, or a validator it lists, is unknown.
  def run
    name = arguments.shift
    return self.class.help_message if name.nil?

    group = Group.groups[name]
    raise Genomer::Error, "Unknown validation group '#{name}'" if group.nil?

    # Build the validator registry once; every call to Validator.validators
    # re-scans the validator directory and rebuilds the lookup hash.
    registry = Validator.validators

    group.validators.map do |key|
      validator = registry[key]
      # Fail with a readable error instead of a NoMethodError on nil.
      raise Genomer::Error, "Unknown validator '#{key}'" if validator.nil?
      validator.new(arguments, flags).run
    end.flatten.join("\n")
  end

  # Builds the usage text: a fixed header followed by one line per available
  # validation group, showing its name padded to 15 columns and its
  # description.
  def self.help_message
    usage = <<-EOS.unindent
      USAGE: genomer validate <GROUP>

      Available validation groups:
    EOS
    listing = Group.groups.map do |(name, group)|
      ' ' + name.ljust(15) + group.description
    end
    usage << listing.join("\n")
  end

end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Namespace and registry for validation groups. Each group is a constant
# defined in this module by a file in the group/ directory.
module GenomerPluginValidate::Group

  # Requires every Ruby source file found in the group/ directory.
  #
  # Only names ending in ".rb" are globbed, so stray files such as editor
  # backups ("foo.rb~") are never handed to require. The list is sorted to
  # give a stable load order.
  def self.load
    pattern = File.join(File.dirname(__FILE__), '..', 'genomer-plugin-validate', 'group', '*.rb')
    Dir[pattern].sort.each { |file| require file }
  end

  # Loads the group files and returns a Hash mapping the lower-cased name of
  # every constant in this module to the constant's value.
  def self.groups
    load
    Hash[constants.map { |name| [name.to_s.downcase, const_get(name)] }]
  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Validation group for the annotations file: supplies a description for the
# help text and the ordered list of validator keys that make up the group.
class GenomerPluginValidate::Group::Annotations

  class << self

    # One-line summary of the group.
    def description
      "Validate GFF3 annotations file"
    end

    # Validator keys belonging to this group, in the order listed here.
    # A new array is built on every call.
    def validators
      %w[
        duplicate_id
        missing_id
        no_name_or_product
        gff3_attributes
        view_attributes
        duplicate_coordinates
        uppercase_name
        bad_product_field
      ].map { |key| key.to_sym }
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'extensions/string'
|
2
|
+
|
3
|
+
# Namespace and registry for individual validators, plus a helper that is
# mixed into validator classes via `include`.
module GenomerPluginValidate::Validator

  # Requires every Ruby source file found in the validator/ directory.
  #
  # Only names ending in ".rb" are globbed, so stray files such as editor
  # backups ("foo.rb~") are never handed to require. The list is sorted to
  # give a stable load order.
  def self.load
    pattern = File.join(File.dirname(__FILE__), '..', 'genomer-plugin-validate', 'validator', '*.rb')
    Dir[pattern].sort.each { |file| require file }
  end

  # Loads the validator files and returns a Hash mapping the underscored,
  # symbolised name of every constant in this module to the constant's value.
  # String#underscore is not core Ruby; presumably it is supplied by the
  # 'extensions/string' file required alongside this module.
  def self.validators
    load
    Hash[constants.map { |name| [name.to_s.underscore.to_sym, const_get(name)] }]
  end

  # Groups the annotations by the value of one attribute.
  #
  # attr - attribute name passed to each annotation's get_attribute.
  #
  # Returns a Hash from the attribute value (converted with to_s) to the
  # Array of annotations carrying it. Annotations for which get_attribute
  # returns nil or false are collected under the nil key. The Hash creates
  # an empty Array when an absent key is read.
  #
  # Relies on an `annotations` method being available in the including class.
  def annotations_by_attribute(attr)
    annotations.inject(Hash.new { |h, k| h[k] = [] }) do |grouped, attn|
      # Look the attribute up once rather than twice per annotation.
      value = attn.get_attribute(attr)
      grouped[value ? value.to_s : nil] << attn
      grouped
    end
  end

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# Validator reporting annotations whose "product" attribute matches one of
# several disallowed patterns.
class GenomerPluginValidate::Validator::BadProductField < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Common message prefix; its single %s placeholder receives the
  # annotation id.
  ERROR = "Bad product field for '%s:' "

  # Returns a flat Array with the messages of all four product checks.
  def run
    [
      hypothetical_products,
      domain_related_like_ending_products,
      nterm_products,
      all_caps_products
    ].flatten
  end

  # Finds annotations whose product attribute matches a pattern.
  #
  # re - Regexp with at least one capture group.
  #
  # Returns an Array of [annotation id, lower-cased first capture] pairs,
  # one per matching annotation.
  def products_matching(re)
    annotations_by_attribute("product").
      map { |(product, entries)| entries.map { |entry| [entry.id, product] } }.
      flatten(1).
      map { |(id, product)| [id, re.match(product)] }.
      select { |(_, match)| match }.
      map { |(id, match)| [id, match.to_a[1].downcase] }
  end

  # Products starting with "hypothetical" that is not followed by " protein".
  def hypothetical_products
    products_matching(/^([Hh]ypothetical)(?! protein)/).
      map { |pair| (ERROR + "start with 'putative' instead of '%s.'") % pair }
  end

  # Products ending in "domain", "related" or "like", optionally followed by
  # one more character.
  def domain_related_like_ending_products
    products_matching(/([Dd]omain|[Rr]elated|[Ll]ike).?$/).
      map { |pair| (ERROR + "products ending with '%s' are not allowed.") % pair }
  end

  # Products containing "N-term" or "N term" (any case), with or without the
  # "inal" suffix.
  def nterm_products
    products_matching(/(?!\B)([Nn][-\s][Tt]erm(inal)?)/).
      # The message only has the id placeholder, so pass the id alone; the
      # full [id, match] pair is one argument too many for the format.
      map { |(id, _)| (ERROR + "'N-terminal' or variations are not allowed.") % [id] }
  end

  # Products made up solely of capital letters, whitespace and hyphens.
  def all_caps_products
    products_matching(/^([A-Z\s-]+)$/).
      map { |(id, _)| (ERROR + "all caps product fields are not allowed.") % [id] }
  end

end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Validator reporting sets of annotations that occupy the same pair of
# coordinates.
class GenomerPluginValidate::Validator::DuplicateCoordinates < Genomer::Plugin

  # Returns one message per coordinate pair shared by more than one
  # annotation, listing the sorted, quoted ids of those annotations. Start
  # and end are sorted before comparison, so their order does not matter.
  def run
    by_location = annotations.group_by { |attn| [attn.start, attn.end].sort }

    by_location.values.select { |group| group.length > 1 }.map do |group|
      quoted_ids = group.map(&:id).sort.map { |id| "'#{id}'" }
      "Identical locations for #{quoted_ids.join(', ')}"
    end
  end

end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Validator reporting ID attribute values used by more than one annotation.
class GenomerPluginValidate::Validator::DuplicateID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per ID value carried by two or more annotations.
  # Annotations without an ID (grouped under the nil key) are ignored here.
  def run
    repeated = annotations_by_attribute('ID').reject do |id, entries|
      id.nil? || entries.length <= 1
    end

    repeated.keys.map { |id| "Duplicate ID '#{id}'" }
  end

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Validator reporting attribute keys that begin with a capital letter but
# are not among the names returned by valid_gff3_attributes.
class GenomerPluginValidate::Validator::Gff3Attributes < Genomer::Plugin

  # Capitalised attribute names this validator accepts.
  def valid_gff3_attributes
    %w[
      ID Name Alias Parent Target Gap Derives_from Note
      Dbxref Ontology_term Is_circular
    ]
  end

  # Returns one message per offending attribute key, in annotation order and,
  # within an annotation, in the order its attributes are enumerated.
  def run
    per_annotation = annotations.map do |attn|
      attn.attributes.
        map    { |(key, _)| key }.
        select { |key| key =~ (/^[A-Z]/) }.
        reject { |key| valid_gff3_attributes.include? key }.
        map    { |key| "Illegal GFF3 attribute '#{key}' for '#{attn.id}'" }
    end

    per_annotation.flatten(1)
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Validator reporting whether any annotation lacks an ID attribute.
class GenomerPluginValidate::Validator::MissingID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns a single-message Array when at least one annotation is grouped
  # under the nil ID key, and an empty Array otherwise.
  def run
    return [] unless annotations_by_attribute('ID').key?(nil)

    ["Annotations found with missing ID attribute"]
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Validator reporting annotations that carry neither a Name nor a product
# attribute.
class GenomerPluginValidate::Validator::NoNameOrProduct < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message for each id that appears both among the annotations
  # without a product and among those without a Name.
  def run
    without_product = annotations_by_attribute('product')[nil].map(&:id)
    without_name    = annotations_by_attribute('Name')[nil].map(&:id)

    # Array#& keeps the order of the left operand and removes duplicates.
    lacking_both = without_name & without_product

    lacking_both.map { |id| "No 'Name' or 'product' attribute for annotation '#{id}'" }
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Validator reporting annotations whose Name attribute begins with a capital
# letter.
class GenomerPluginValidate::Validator::UppercaseName < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per annotation whose Name starts with A-Z, ordered
  # by name group and then by position within the group.
  def run
    capitalised = annotations_by_attribute('Name').select { |name, _| name =~ /^[A-Z]/ }
    offenders   = capitalised.map { |_, entries| entries }.flatten

    offenders.map do |attn|
      "Illegal capitalised Name attribute '%s' for '%s'" % [attn.get_attribute('Name'), attn.id]
    end
  end

end
|