genomer-plugin-validate 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +9 -0
- data/VERSION +1 -0
- data/features/annotations/bad-product-field.feature +91 -0
- data/features/annotations/command-line-interface.feature +19 -0
- data/features/annotations/duplicate_id.feature +144 -0
- data/features/annotations/identical_locations.feature +74 -0
- data/features/annotations/incorrect-attributes.feature +135 -0
- data/features/annotations/missing_attributes.feature +75 -0
- data/features/annotations/name.feature +40 -0
- data/features/command-line-interface.feature +36 -0
- data/features/support/env.rb +13 -0
- data/genomer-plugin-validate.gemspec +28 -0
- data/lib/extensions/string.rb +12 -0
- data/lib/genomer-plugin-validate.rb +33 -0
- data/lib/genomer-plugin-validate/group.rb +17 -0
- data/lib/genomer-plugin-validate/group/annotations.rb +20 -0
- data/lib/genomer-plugin-validate/validator.rb +27 -0
- data/lib/genomer-plugin-validate/validator/bad_product_field.rb +45 -0
- data/lib/genomer-plugin-validate/validator/duplicate_coordinates.rb +12 -0
- data/lib/genomer-plugin-validate/validator/duplicate_id.rb +11 -0
- data/lib/genomer-plugin-validate/validator/gff3_attributes.rb +16 -0
- data/lib/genomer-plugin-validate/validator/missing_id.rb +13 -0
- data/lib/genomer-plugin-validate/validator/no_name_or_product.rb +13 -0
- data/lib/genomer-plugin-validate/validator/uppercase_name.rb +13 -0
- data/lib/genomer-plugin-validate/validator/view_attributes.rb +16 -0
- data/man/genomer-validate.ronn +100 -0
- data/spec/genomer-plugin-validate/group/annotations_spec.rb +18 -0
- data/spec/genomer-plugin-validate/group_spec.rb +24 -0
- data/spec/genomer-plugin-validate/validator/bad_product_field_spec.rb +93 -0
- data/spec/genomer-plugin-validate/validator/duplicate_coordinates_spec.rb +24 -0
- data/spec/genomer-plugin-validate/validator/duplicate_id_spec.rb +34 -0
- data/spec/genomer-plugin-validate/validator/gff_attributes_spec.rb +32 -0
- data/spec/genomer-plugin-validate/validator/missing_id_spec.rb +27 -0
- data/spec/genomer-plugin-validate/validator/no_name_or_product_spec.rb +28 -0
- data/spec/genomer-plugin-validate/validator/uppercase_name_spec.rb +22 -0
- data/spec/genomer-plugin-validate/validator/view_attributes_spec.rb +31 -0
- data/spec/genomer-plugin-validate/validator_spec.rb +107 -0
- data/spec/genomer-plugin-validate_spec.rb +92 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/validator_run_matcher.rb +25 -0
- metadata +244 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
Feature: Validating annotation files for missing attributes
|
2
|
+
In order to submit genome annotations
|
3
|
+
A user can use the "annotations" command to detect missing ID, Name or product attributes
|
4
|
+
to ensure that their annotation file contains no errors
|
5
|
+
|
6
|
+
@disable-bundler
|
7
|
+
Scenario: Validating an annotations file with a missing ID attribute
|
8
|
+
Given I successfully run `genomer init project`
|
9
|
+
And I cd to "project"
|
10
|
+
And I write to "assembly/scaffold.yml" with:
|
11
|
+
"""
|
12
|
+
---
|
13
|
+
- sequence:
|
14
|
+
source: contig1
|
15
|
+
"""
|
16
|
+
And I write to "assembly/sequence.fna" with:
|
17
|
+
"""
|
18
|
+
>contig1
|
19
|
+
AAAAATTTTTGGGGGCCCCC
|
20
|
+
"""
|
21
|
+
And I write to "assembly/annotations.gff" with:
|
22
|
+
"""
|
23
|
+
##gff-version 3
|
24
|
+
contig1 . gene 1 3 . + 1 ID=gene1;Name=something
|
25
|
+
contig1 . gene 4 6 . + 1 Name=something
|
26
|
+
"""
|
27
|
+
And I append to "Gemfile" with:
|
28
|
+
"""
|
29
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
30
|
+
"""
|
31
|
+
When I run `genomer validate annotations`
|
32
|
+
Then the exit status should be 0
|
33
|
+
And the output should contain:
|
34
|
+
"""
|
35
|
+
Annotations found with missing ID attribute
|
36
|
+
|
37
|
+
"""
|
38
|
+
|
39
|
+
@disable-bundler
|
40
|
+
Scenario: Validating an annotations file with missing Name or product attributes
|
41
|
+
Given I successfully run `genomer init project`
|
42
|
+
And I cd to "project"
|
43
|
+
And I write to "assembly/scaffold.yml" with:
|
44
|
+
"""
|
45
|
+
---
|
46
|
+
- sequence:
|
47
|
+
source: contig1
|
48
|
+
"""
|
49
|
+
And I write to "assembly/sequence.fna" with:
|
50
|
+
"""
|
51
|
+
>contig1
|
52
|
+
AAAAATTTTTGGGGGCCCCC
|
53
|
+
"""
|
54
|
+
And I write to "assembly/annotations.gff" with:
|
55
|
+
"""
|
56
|
+
##gff-version 3
|
57
|
+
contig1 . gene 1 2 . + 1 ID=gene1;Name=something
|
58
|
+
contig1 . gene 3 4 . + 1 ID=gene2;product=something
|
59
|
+
contig1 . gene 5 6 . + 1 ID=gene3;product=something;Name=else
|
60
|
+
contig1 . gene 7 8 . + 1 ID=gene4
|
61
|
+
"""
|
62
|
+
And I append to "Gemfile" with:
|
63
|
+
"""
|
64
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
65
|
+
"""
|
66
|
+
When I run `genomer validate annotations`
|
67
|
+
Then the exit status should be 0
|
68
|
+
And the output should not contain "gene1"
|
69
|
+
And the output should not contain "gene2"
|
70
|
+
And the output should not contain "gene3"
|
71
|
+
And the output should contain:
|
72
|
+
"""
|
73
|
+
No 'Name' or 'product' attribute for annotation 'gene4'
|
74
|
+
"""
|
75
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: Validating annotation files for incorrect names
|
2
|
+
In order to submit genome annotations
|
3
|
+
A user can use the "annotations" command to detect uppercase gene names
|
4
|
+
to ensure that their annotation file contains none of these
|
5
|
+
|
6
|
+
@disable-bundler
|
7
|
+
Scenario: Validating an annotations file with an uppercase name attribute
|
8
|
+
Given I successfully run `genomer init project`
|
9
|
+
And I cd to "project"
|
10
|
+
And I write to "assembly/scaffold.yml" with:
|
11
|
+
"""
|
12
|
+
---
|
13
|
+
- sequence:
|
14
|
+
source: contig1
|
15
|
+
"""
|
16
|
+
And I write to "assembly/sequence.fna" with:
|
17
|
+
"""
|
18
|
+
>contig1
|
19
|
+
AAAAATTTTTGGGGGCCCCC
|
20
|
+
"""
|
21
|
+
And I write to "assembly/annotations.gff" with:
|
22
|
+
"""
|
23
|
+
##gff-version 3
|
24
|
+
contig1 . gene 1 3 . + 1 Name=Uppercase;ID=1
|
25
|
+
contig1 . gene 4 6 . + 1 Name=lowercase;ID=2
|
26
|
+
"""
|
27
|
+
And I append to "Gemfile" with:
|
28
|
+
"""
|
29
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
30
|
+
"""
|
31
|
+
When I run `genomer validate annotations`
|
32
|
+
Then the exit status should be 0
|
33
|
+
And the output should contain:
|
34
|
+
"""
|
35
|
+
Illegal capitalised Name attribute 'Uppercase' for '1'
|
36
|
+
"""
|
37
|
+
And the output should not contain:
|
38
|
+
"""
|
39
|
+
Illegal capitalised Name attribute 'lowercase' for '2'
|
40
|
+
"""
|
@@ -0,0 +1,36 @@
|
|
1
|
+
Feature: The validator command line interface
|
2
|
+
In order to generate correct genomer builds
|
3
|
+
A user can use the "validate" plugin at the command line
|
4
|
+
to validate their genome build
|
5
|
+
|
6
|
+
@disable-bundler
|
7
|
+
Scenario: Running with just the 'validate' command
|
8
|
+
Given I successfully run `genomer init project`
|
9
|
+
And I cd to "project"
|
10
|
+
And I append to "Gemfile" with:
|
11
|
+
"""
|
12
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
13
|
+
"""
|
14
|
+
When I run `genomer validate`
|
15
|
+
Then the exit status should be 0
|
16
|
+
And the output should contain:
|
17
|
+
"""
|
18
|
+
USAGE: genomer validate <GROUP>
|
19
|
+
|
20
|
+
Available validation groups:
|
21
|
+
"""
|
22
|
+
|
23
|
+
@disable-bundler
|
24
|
+
Scenario: Running with an unknown validation group
|
25
|
+
Given I successfully run `genomer init project`
|
26
|
+
And I cd to "project"
|
27
|
+
And I append to "Gemfile" with:
|
28
|
+
"""
|
29
|
+
gem 'genomer-plugin-validate', :path => '../../../'
|
30
|
+
"""
|
31
|
+
When I run `genomer validate unknown`
|
32
|
+
Then the exit status should be 1
|
33
|
+
And the output should contain:
|
34
|
+
"""
|
35
|
+
Error. Unknown validation group 'unknown'
|
36
|
+
"""
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
# Activate the default and development gem groups before the features run.
# When Bundler cannot resolve them, report the problem on stderr and exit
# with the status code carried by the Bundler error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message, "Run `bundle install` to install missing gems"
  exit error.status_code
end

# Prepend the project's lib and spec directories to the load path. 'lib' is
# unshifted first, so 'spec' ends up ahead of it, as in the original order.
%w[lib spec].each do |dir|
  $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../../#{dir}")
end
|
12
|
+
|
13
|
+
require 'aruba/cucumber'
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for the genomer "validate" plugin.
Gem::Specification.new do |s|
  s.name        = "genomer-plugin-validate"
  # Read VERSION relative to this gemspec rather than the current working
  # directory, and drop the trailing newline the file normally ends with.
  s.version     = File.read(File.expand_path('../VERSION', __FILE__)).strip
  s.authors     = ["Michael Barton"]
  s.email       = ["mail@michaelbarton.me.uk"]
  s.homepage    = ""
  s.summary     = %q{Validate assembly files for errors}
  s.description = %q{Test assembly files for common errors which may lead to incorrect assembly}

  s.rubyforge_project = "genomer-plugin-validate"

  # The packaged file lists come from git, so the gem must be built from
  # inside a git checkout of the project.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Runtime dependencies.
  s.add_runtime_dependency "genomer",          ">= 0.0.4"
  s.add_runtime_dependency "heredoc_unindent", "~> 1.1.0"

  # Development and test dependencies.
  s.add_development_dependency "rake"
  s.add_development_dependency "rspec",                   "~> 2.8.0"
  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
  s.add_development_dependency "cucumber",                "~> 1.1.4"
  s.add_development_dependency "aruba",                   "~> 0.4.11"
  s.add_development_dependency "rr",                      "~> 1.0.4"
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require "genomer"
|
2
|
+
require "heredoc_unindent"
|
3
|
+
|
4
|
+
# Genomer plugin behind `genomer validate <GROUP>`: it looks up the named
# validation group and runs each of that group's validators.
class GenomerPluginValidate < Genomer::Plugin
  require 'genomer-plugin-validate/validator'
  require 'genomer-plugin-validate/group'

  # Runs the validation group named by the first command line argument.
  #
  # Returns the help message when no group name was given; otherwise the
  # messages produced by the group's validators, joined with newlines.
  # Raises Genomer::Error when the group name is not a known group.
  def run
    name = arguments.shift
    return self.class.help_message if name.nil?

    group = Group.groups[name]
    raise Genomer::Error, "Unknown validation group '#{name}'" if group.nil?

    # Fetch the validator table once: each call to Validator.validators
    # re-scans the validator directory and rebuilds the whole hash.
    available = Validator.validators
    group.validators.
      map { |key| available[key] }.
      map { |validator| validator.new(arguments, flags).run }.
      flatten.
      join("\n")
  end

  # Builds the usage text: a fixed header followed by one line per validation
  # group, holding the group name padded to 15 characters and its description.
  def self.help_message
    msg = <<-EOS.unindent
      USAGE: genomer validate <GROUP>

      Available validation groups:
    EOS
    listing = Group.groups.map do |name, group|
      ' ' + name.ljust(15) + group.description
    end
    msg << listing.join("\n")
  end

end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Namespace and registry for validation groups. Each group is a constant
# defined under this module by a file in the 'group' directory.
module GenomerPluginValidate::Group

  # Requires every Ruby source file in the 'group' directory.
  #
  # The glob is restricted to '*.rb' so that stray entries whose names merely
  # contain ".rb" (editor backups such as 'foo.rb.orig', compiled 'foo.rbc'
  # files) are never handed to `require`. The list is sorted so the load
  # order does not depend on the platform's directory ordering.
  def self.load
    path = File.join(File.dirname(__FILE__),'..','genomer-plugin-validate','group','*.rb')
    Dir[path].sort.each { |file| require file }
  end

  # Returns a Hash mapping each group's lower-cased constant name to the
  # constant itself, loading the group files first.
  def self.groups
    load
    Hash[constants.map { |name| [name.to_s.downcase, const_get(name)] }]
  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Validation group bundling the checks applied to an annotations file.
class GenomerPluginValidate::Group::Annotations

  # Short description of this group (printed beside the group name by
  # GenomerPluginValidate.help_message).
  def self.description
    "Validate GFF3 annotations file"
  end

  # Keys of the validators belonging to this group, in the order they run.
  # A fresh array of symbols is returned on every call.
  def self.validators
    %w[
      duplicate_id
      missing_id
      no_name_or_product
      gff3_attributes
      view_attributes
      duplicate_coordinates
      uppercase_name
      bad_product_field
    ].map(&:to_sym)
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'extensions/string'
|
2
|
+
|
3
|
+
# Namespace and registry for validators, plus a helper mixed into the
# validator classes that include this module.
module GenomerPluginValidate::Validator

  # Requires every Ruby source file in the 'validator' directory.
  #
  # The glob is restricted to '*.rb' so that stray entries whose names merely
  # contain ".rb" (editor backups such as 'foo.rb.orig', compiled 'foo.rbc'
  # files) are never handed to `require`. The list is sorted so the load
  # order does not depend on the platform's directory ordering.
  def self.load
    path = File.join(File.dirname(__FILE__),'..','genomer-plugin-validate','validator','*.rb')
    Dir[path].sort.each { |file| require file }
  end

  # Returns a Hash mapping each validator's underscored constant name (as a
  # Symbol) to the constant itself, loading the validator files first.
  # `String#underscore` comes from 'extensions/string'.
  def self.validators
    load
    Hash[constants.map { |name| [name.to_s.underscore.to_sym, const_get(name)] }]
  end

  # Groups the annotations by the value of the attribute +attr+.
  #
  # Returns a Hash whose keys are the attribute values converted to strings
  # and whose values are arrays of the annotations carrying that value.
  # Annotations for which the attribute is nil or false are collected under
  # the nil key. Looking up an absent key yields (and stores) an empty array.
  #
  # Relies on the including class providing `annotations`.
  def annotations_by_attribute(attr)
    annotations.each_with_object(Hash.new { |h, k| h[k] = [] }) do |attn, grouped|
      value = attn.get_attribute(attr)
      grouped[value ? value.to_s : nil] << attn
    end
  end

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# Validator reporting annotations whose 'product' attribute matches one of
# several disallowed patterns.
class GenomerPluginValidate::Validator::BadProductField < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Common message prefix; the placeholder receives the annotation id.
  ERROR = "Bad product field for '%s:' "

  # Returns a flat array with the messages from every product check.
  def run
    [
      hypothetical_products,
      domain_related_like_ending_products,
      nterm_products,
      all_caps_products
    ].flatten
  end

  # Finds annotations whose product field matches +re+.
  #
  # Returns an array of [id, text] pairs, where text is the down-cased first
  # capture group of the match; +re+ must therefore contain a capture group.
  def products_matching(re)
    id_product_pairs = annotations_by_attribute("product").
      map { |product, entries| entries.map { |entry| [entry.id, product] } }.
      flatten(1)

    id_product_pairs.
      map { |id, product| [id, re.match(product)] }.
      select { |_, match| match }.
      map { |id, match| [id, match[1].downcase] }
  end

  # Products starting with "hypothetical" that is not followed by " protein".
  def hypothetical_products
    products_matching(/^([Hh]ypothetical)(?! protein)/).
      map{|i| (ERROR + "start with 'putative' instead of '%s.'") % i}
  end

  # Products ending in "domain", "related" or "like", optionally followed by
  # one more character.
  def domain_related_like_ending_products
    products_matching(/([Dd]omain|[Rr]elated|[Ll]ike).?$/).
      map{|i| (ERROR + "products ending with '%s' are not allowed.") % i}
  end

  # Products mentioning "N-term" / "N term" or "N-terminal" / "N terminal".
  def nterm_products
    # The message has a single placeholder, so format it with the id only (as
    # all_caps_products does). Passing the whole [id, match] pair leaves a
    # surplus format argument, which Ruby rejects when $DEBUG is set.
    products_matching(/(?!\B)([Nn][-\s][Tt]erm(inal)?)/).
      map(&:first).
      map{|i| (ERROR + "'N-terminal' or variations are not allowed.") % i}
  end

  # Products made up only of capital letters, whitespace and hyphens.
  def all_caps_products
    products_matching(/^([A-Z\s-]+)$/).
      map(&:first).
      map{|i| (ERROR + "all caps product fields are not allowed.") % i}
  end

end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Validator reporting annotations that occupy exactly the same location.
class GenomerPluginValidate::Validator::DuplicateCoordinates < Genomer::Plugin

  # Groups annotations by their sorted [start, end] pair and returns one
  # message for every location shared by more than one annotation. The ids in
  # each message are sorted, quoted and comma separated.
  def run
    by_location = annotations.group_by { |record| [record.start, record.end].sort }
    shared      = by_location.select { |_, records| records.length > 1 }

    shared.map do |_, records|
      quoted_ids = records.map(&:id).sort.map { |id| "'#{id}'" }
      "Identical locations for #{quoted_ids.join(', ')}"
    end
  end

end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Validator reporting ID attribute values used by more than one annotation.
class GenomerPluginValidate::Validator::DuplicateID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per ID shared by two or more annotations.
  # Annotations without an ID (grouped under the nil key) are not reported.
  def run
    repeated = annotations_by_attribute('ID').select do |id, entries|
      entries.length > 1 && !id.nil?
    end
    repeated.map { |id, _| "Duplicate ID '#{id}'" }
  end

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Validator reporting capitalised attribute keys that are not in the list of
# accepted GFF3 attribute names.
class GenomerPluginValidate::Validator::Gff3Attributes < Genomer::Plugin

  # Attribute names accepted when the key begins with a capital letter.
  def valid_gff3_attributes
    %w[
      ID Name Alias Parent Target Gap Derives_from Note
      Dbxref Ontology_term Is_circular
    ]
  end

  # Returns one message for every attribute key that starts with an upper
  # case letter and is absent from valid_gff3_attributes. Keys starting with
  # any other character are not examined.
  def run
    permitted = valid_gff3_attributes

    annotations.map do |record|
      record.attributes.
        map { |key, _| key }.
        select { |key| key =~ /^[A-Z]/ && !permitted.include?(key) }.
        map { |key| "Illegal GFF3 attribute '#{key}' for '#{record.id}'" }
    end.flatten(1)
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Validator reporting whether any annotation lacks an ID attribute.
class GenomerPluginValidate::Validator::MissingID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns a single-message array when at least one annotation has no ID
  # (i.e. is grouped under the nil key), and an empty array otherwise.
  def run
    id_missing = annotations_by_attribute('ID').any? { |id, _| id.nil? }
    id_missing ? ["Annotations found with missing ID attribute"] : []
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Validator reporting annotations that have neither a Name nor a product
# attribute.
class GenomerPluginValidate::Validator::NoNameOrProduct < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Collects the ids of annotations missing 'product' and of those missing
  # 'Name', then returns one message per id present in both lists.
  def run
    without_product, without_name = %w[product Name].map do |attribute|
      annotations_by_attribute(attribute)[nil].map(&:id)
    end

    (without_name & without_product).map do |id|
      "No 'Name' or 'product' attribute for annotation '#{id}'"
    end
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Validator reporting annotations whose Name attribute begins with a capital
# letter.
class GenomerPluginValidate::Validator::UppercaseName < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per annotation whose Name starts with A-Z, giving the
  # offending name and the annotation's id.
  def run
    capitalised = annotations_by_attribute('Name').select { |name, _| name =~ /^[A-Z]/ }
    records     = capitalised.map { |_, entries| entries }.flatten

    records.map do |record|
      "Illegal capitalised Name attribute '%s' for '%s'" % [record.get_attribute('Name'), record.id]
    end
  end

end
|