genomer-plugin-validate 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/.gitignore +4 -0
  2. data/Gemfile +4 -0
  3. data/Rakefile +9 -0
  4. data/VERSION +1 -0
  5. data/features/annotations/bad-product-field.feature +91 -0
  6. data/features/annotations/command-line-interface.feature +19 -0
  7. data/features/annotations/duplicate_id.feature +144 -0
  8. data/features/annotations/identical_locations.feature +74 -0
  9. data/features/annotations/incorrect-attributes.feature +135 -0
  10. data/features/annotations/missing_attributes.feature +75 -0
  11. data/features/annotations/name.feature +40 -0
  12. data/features/command-line-interface.feature +36 -0
  13. data/features/support/env.rb +13 -0
  14. data/genomer-plugin-validate.gemspec +28 -0
  15. data/lib/extensions/string.rb +12 -0
  16. data/lib/genomer-plugin-validate.rb +33 -0
  17. data/lib/genomer-plugin-validate/group.rb +17 -0
  18. data/lib/genomer-plugin-validate/group/annotations.rb +20 -0
  19. data/lib/genomer-plugin-validate/validator.rb +27 -0
  20. data/lib/genomer-plugin-validate/validator/bad_product_field.rb +45 -0
  21. data/lib/genomer-plugin-validate/validator/duplicate_coordinates.rb +12 -0
  22. data/lib/genomer-plugin-validate/validator/duplicate_id.rb +11 -0
  23. data/lib/genomer-plugin-validate/validator/gff3_attributes.rb +16 -0
  24. data/lib/genomer-plugin-validate/validator/missing_id.rb +13 -0
  25. data/lib/genomer-plugin-validate/validator/no_name_or_product.rb +13 -0
  26. data/lib/genomer-plugin-validate/validator/uppercase_name.rb +13 -0
  27. data/lib/genomer-plugin-validate/validator/view_attributes.rb +16 -0
  28. data/man/genomer-validate.ronn +100 -0
  29. data/spec/genomer-plugin-validate/group/annotations_spec.rb +18 -0
  30. data/spec/genomer-plugin-validate/group_spec.rb +24 -0
  31. data/spec/genomer-plugin-validate/validator/bad_product_field_spec.rb +93 -0
  32. data/spec/genomer-plugin-validate/validator/duplicate_coordinates_spec.rb +24 -0
  33. data/spec/genomer-plugin-validate/validator/duplicate_id_spec.rb +34 -0
  34. data/spec/genomer-plugin-validate/validator/gff_attributes_spec.rb +32 -0
  35. data/spec/genomer-plugin-validate/validator/missing_id_spec.rb +27 -0
  36. data/spec/genomer-plugin-validate/validator/no_name_or_product_spec.rb +28 -0
  37. data/spec/genomer-plugin-validate/validator/uppercase_name_spec.rb +22 -0
  38. data/spec/genomer-plugin-validate/validator/view_attributes_spec.rb +31 -0
  39. data/spec/genomer-plugin-validate/validator_spec.rb +107 -0
  40. data/spec/genomer-plugin-validate_spec.rb +92 -0
  41. data/spec/spec_helper.rb +35 -0
  42. data/spec/validator_run_matcher.rb +25 -0
  43. metadata +244 -0
@@ -0,0 +1,75 @@
1
+ Feature: Validating annotation files for missing attributes
2
+ In order to submit genome annotations
3
+ A user can use the "annotations" command to detect missing ID, Name or product attributes
4
+ to ensure that their annotation file contains no errors
5
+
6
+ @disable-bundler
7
+ Scenario: Validating an annotations file with a missing ID attribute
8
+ Given I successfully run `genomer init project`
9
+ And I cd to "project"
10
+ And I write to "assembly/scaffold.yml" with:
11
+ """
12
+ ---
13
+ - sequence:
14
+ source: contig1
15
+ """
16
+ And I write to "assembly/sequence.fna" with:
17
+ """
18
+ >contig1
19
+ AAAAATTTTTGGGGGCCCCC
20
+ """
21
+ And I write to "assembly/annotations.gff" with:
22
+ """
23
+ ##gff-version 3
24
+ contig1 . gene 1 3 . + 1 ID=gene1;Name=something
25
+ contig1 . gene 4 6 . + 1 Name=something
26
+ """
27
+ And I append to "Gemfile" with:
28
+ """
29
+ gem 'genomer-plugin-validate', :path => '../../../'
30
+ """
31
+ When I run `genomer validate annotations`
32
+ Then the exit status should be 0
33
+ And the output should contain:
34
+ """
35
+ Annotations found with missing ID attribute
36
+
37
+ """
38
+
39
+ @disable-bundler
40
+ Scenario: Validating an annotations file with missing Name or product attributes
41
+ Given I successfully run `genomer init project`
42
+ And I cd to "project"
43
+ And I write to "assembly/scaffold.yml" with:
44
+ """
45
+ ---
46
+ - sequence:
47
+ source: contig1
48
+ """
49
+ And I write to "assembly/sequence.fna" with:
50
+ """
51
+ >contig1
52
+ AAAAATTTTTGGGGGCCCCC
53
+ """
54
+ And I write to "assembly/annotations.gff" with:
55
+ """
56
+ ##gff-version 3
57
+ contig1 . gene 1 2 . + 1 ID=gene1;Name=something
58
+ contig1 . gene 3 4 . + 1 ID=gene2;product=something
59
+ contig1 . gene 5 6 . + 1 ID=gene3;product=something;Name=else
60
+ contig1 . gene 7 8 . + 1 ID=gene4
61
+ """
62
+ And I append to "Gemfile" with:
63
+ """
64
+ gem 'genomer-plugin-validate', :path => '../../../'
65
+ """
66
+ When I run `genomer validate annotations`
67
+ Then the exit status should be 0
68
+ And the output should not contain "gene1"
69
+ And the output should not contain "gene2"
70
+ And the output should not contain "gene3"
71
+ And the output should contain:
72
+ """
73
+ No 'Name' or 'product' attribute for annotation 'gene4'
74
+ """
75
+
@@ -0,0 +1,40 @@
1
+ Feature: Validating annotation files for incorrect names
2
+ In order to submit genome annotations
3
+ A user can use the "annotations" command to detect uppercase gene names
4
+ to ensure that their annotation file contains none of these
5
+
6
+ @disable-bundler
7
+ Scenario: Validating an annotations file with an uppercase name attribute
8
+ Given I successfully run `genomer init project`
9
+ And I cd to "project"
10
+ And I write to "assembly/scaffold.yml" with:
11
+ """
12
+ ---
13
+ - sequence:
14
+ source: contig1
15
+ """
16
+ And I write to "assembly/sequence.fna" with:
17
+ """
18
+ >contig1
19
+ AAAAATTTTTGGGGGCCCCC
20
+ """
21
+ And I write to "assembly/annotations.gff" with:
22
+ """
23
+ ##gff-version 3
24
+ contig1 . gene 1 3 . + 1 Name=Uppercase;ID=1
25
+ contig1 . gene 4 6 . + 1 Name=lowercase;ID=2
26
+ """
27
+ And I append to "Gemfile" with:
28
+ """
29
+ gem 'genomer-plugin-validate', :path => '../../../'
30
+ """
31
+ When I run `genomer validate annotations`
32
+ Then the exit status should be 0
33
+ And the output should contain:
34
+ """
35
+ Illegal capitalised Name attribute 'Uppercase' for '1'
36
+ """
37
+ And the output should not contain:
38
+ """
39
+ Illegal capitalised Name attribute 'lowercase' for '2'
40
+ """
@@ -0,0 +1,36 @@
1
+ Feature: The validator command line interface
2
+ In order to generate correct genomer builds
3
+ A user can use the "validate" plugin at the command line
4
+ to validate their genome build
5
+
6
+ @disable-bundler
7
+ Scenario: Running with just the 'validate' command
8
+ Given I successfully run `genomer init project`
9
+ And I cd to "project"
10
+ And I append to "Gemfile" with:
11
+ """
12
+ gem 'genomer-plugin-validate', :path => '../../../'
13
+ """
14
+ When I run `genomer validate`
15
+ Then the exit status should be 0
16
+ And the output should contain:
17
+ """
18
+ USAGE: genomer validate <GROUP>
19
+
20
+ Available validation groups:
21
+ """
22
+
23
+ @disable-bundler
24
+ Scenario: Running with an unknown validation group
25
+ Given I successfully run `genomer init project`
26
+ And I cd to "project"
27
+ And I append to "Gemfile" with:
28
+ """
29
+ gem 'genomer-plugin-validate', :path => '../../../'
30
+ """
31
+ When I run `genomer validate unknown`
32
+ Then the exit status should be 1
33
+ And the output should contain:
34
+ """
35
+ Error. Unknown validation group 'unknown'
36
+ """
@@ -0,0 +1,13 @@
1
# Cucumber support environment: activate the bundled gems, put the project's
# lib/ and spec/ directories on the load path, then load Aruba's Cucumber
# integration.
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  # Report the Bundler failure and exit with the status code it carries.
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

# Each directory is prepended in turn, so spec/ ends up ahead of lib/.
%w[lib spec].each do |dir|
  $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../../#{dir}")
end

require 'aruba/cucumber'
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for the genomer "validate" plugin.
Gem::Specification.new do |s|
  s.name        = "genomer-plugin-validate"
  # Read VERSION relative to this gemspec rather than the current working
  # directory, and drop any trailing newline so the version string is clean.
  s.version     = File.read(File.expand_path('../VERSION', __FILE__)).strip
  s.authors     = ["Michael Barton"]
  s.email       = ["mail@michaelbarton.me.uk"]
  s.homepage    = ""
  s.summary     = %q{Validate assembly files for errors}
  s.description = %q{Test assembly files for common errors which may lead to incorrect assembly}

  s.rubyforge_project = "genomer-plugin-validate"

  # The packaged file lists are taken from what git tracks, so the gem must
  # be built from inside a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency "genomer",          ">= 0.0.4"
  s.add_runtime_dependency "heredoc_unindent", "~> 1.1.0"

  s.add_development_dependency "rake"
  s.add_development_dependency "rspec",                   "~> 2.8.0"
  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
  s.add_development_dependency "cucumber",                "~> 1.1.4"
  s.add_development_dependency "aruba",                   "~> 0.4.11"
  s.add_development_dependency "rr",                      "~> 1.0.4"
end
@@ -0,0 +1,12 @@
1
class String

  # Converts a CamelCase (optionally namespaced) name into its
  # snake_case form, e.g. "DuplicateID" => "duplicate_id" and
  # "Foo::BarBaz" => "foo/bar_baz". Dashes are turned into underscores.
  #
  # Taken from Rails. The method is only added when String does not already
  # respond to it, so an implementation loaded earlier by another library is
  # not silently overwritten.
  unless method_defined?(:underscore)
    def underscore
      gsub(/::/, '/').
        gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
        gsub(/([a-z\d])([A-Z])/, '\1_\2').
        tr("-", "_").
        downcase
    end
  end

end
@@ -0,0 +1,33 @@
1
+ require "genomer"
2
+ require "heredoc_unindent"
3
+
4
# Genomer plugin entry point for `genomer validate <GROUP>`.
#
# Looks up the named validation group, runs each validator the group lists
# and returns their messages joined with newlines.
class GenomerPluginValidate < Genomer::Plugin
  require 'genomer-plugin-validate/validator'
  require 'genomer-plugin-validate/group'

  # Runs the validation group named by the first command-line argument.
  #
  # Returns the usage message when no group is given; otherwise the
  # newline-joined messages produced by the group's validators.
  # Raises Genomer::Error when the group, or a validator the group refers
  # to, is unknown.
  def run
    name = arguments.shift
    return self.class.help_message if name.nil?

    group = Group.groups[name]
    raise Genomer::Error, "Unknown validation group '#{name}'" if group.nil?

    # Build the validator registry once instead of once per validator:
    # every call to Validator.validators re-scans the validator directory.
    available = Validator.validators

    messages = group.validators.map do |key|
      validator = available[key]
      # Fail with a readable error rather than a NoMethodError on nil.
      raise Genomer::Error, "Unknown validator '#{key}'" if validator.nil?
      validator.new(arguments, flags).run
    end

    messages.flatten * "\n"
  end

  # Returns the usage text followed by one line per available validation
  # group (group name padded to 15 characters, then its description).
  def self.help_message
    msg = <<-EOS.unindent
      USAGE: genomer validate <GROUP>

      Available validation groups:
    EOS
    listing = Group.groups.map do |(group_name, group)|
      ' ' + group_name.ljust(15) + group.description
    end
    msg << listing.join("\n")
  end

end
@@ -0,0 +1,17 @@
1
# Namespace and registry for validation groups. Every constant defined
# under this module is treated as a group.
module GenomerPluginValidate::Group

  # Requires each Ruby source file found in the group/ directory.
  #
  # Only files whose extension is exactly ".rb" are loaded, so editor
  # backups or compiled artefacts such as "foo.rb.bak" or "foo.rbc" are
  # skipped. Files are required in sorted order so loading is deterministic.
  def self.load
    pattern = File.join(File.dirname(__FILE__),'..','genomer-plugin-validate','group','*')
    Dir[pattern].sort.each do |file|
      require file if File.extname(file) == '.rb'
    end
  end

  # Returns a Hash mapping each group's lower-cased constant name (a String)
  # to the group constant itself, loading the group files first.
  def self.groups
    load
    pairs = constants.map do |name|
      [name.to_s.downcase, const_get(name)]
    end
    Hash[pairs]
  end

end
@@ -0,0 +1,20 @@
1
# The "annotations" validation group: its description and the validators
# it runs.
class GenomerPluginValidate::Group::Annotations

  class << self

    # One-line summary of this group.
    def description
      "Validate GFF3 annotations file"
    end

    # Validator keys (Symbols) belonging to this group, in the order listed.
    def validators
      %w[
        duplicate_id
        missing_id
        no_name_or_product
        gff3_attributes
        view_attributes
        duplicate_coordinates
        uppercase_name
        bad_product_field
      ].map { |key| key.to_sym }
    end

  end

end
@@ -0,0 +1,27 @@
1
+ require 'extensions/string'
2
+
3
# Namespace, registry and shared helpers for individual validators. Every
# constant defined under this module is treated as a validator.
module GenomerPluginValidate::Validator

  # Requires each Ruby source file found in the validator/ directory.
  #
  # Only files whose extension is exactly ".rb" are loaded, so editor
  # backups or compiled artefacts such as "foo.rb.bak" or "foo.rbc" are
  # skipped. Files are required in sorted order so loading is deterministic.
  def self.load
    pattern = File.join(File.dirname(__FILE__),'..','genomer-plugin-validate','validator','*')
    Dir[pattern].sort.each do |file|
      require file if File.extname(file) == '.rb'
    end
  end

  # Returns a Hash mapping each validator's underscored constant name
  # (a Symbol, e.g. :duplicate_id) to the validator constant itself,
  # loading the validator files first.
  def self.validators
    load
    pairs = constants.map do |name|
      [name.to_s.underscore.to_sym, const_get(name)]
    end
    Hash[pairs]
  end

  # Groups the including object's annotations by the value of one attribute.
  #
  # attr - name of the attribute to group by, e.g. 'ID'.
  #
  # Returns a Hash of attribute value (as a String) => Array of annotations.
  # Annotations for which get_attribute returns nil/false are collected
  # under the nil key. The Hash creates an empty Array for any key that is
  # looked up but absent, so callers may index it with [nil] safely.
  #
  # NOTE(review): `annotations` is not defined here; presumably supplied by
  # the class that includes this module - confirm.
  def annotations_by_attribute(attr)
    grouped = Hash.new{|h,k| h[k] = []}
    annotations.each do |attn|
      value = attn.get_attribute(attr)
      grouped[value ? value.to_s : nil] << attn
    end
    grouped
  end

end
@@ -0,0 +1,45 @@
1
# Validator reporting annotations whose 'product' attribute uses wording
# this plugin rejects: a leading "hypothetical" not followed by " protein",
# a trailing "domain"/"related"/"like", "N-term(inal)" variants, and
# products written entirely in capitals.
class GenomerPluginValidate::Validator::BadProductField < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Common message prefix; '%s' receives the annotation ID.
  ERROR = "Bad product field for '%s:' "

  # Returns a flat Array of error messages from all product checks.
  def run
    [
      hypothetical_products,
      domain_related_like_ending_products,
      nterm_products,
      all_caps_products
    ].flatten
  end

  # Finds annotations whose product matches the given pattern.
  #
  # re - Regexp whose first capture group is the offending text.
  #
  # Returns an Array of [annotation id, down-cased first capture] pairs.
  def products_matching(re)
    annotations_by_attribute("product").
      map{|(product,entries)| entries.map{|i| [i.id,product]}}.
      flatten(1).
      map{|(id,product)| [id, re.match(product)]}.
      select{|(_,match)| match}.
      map{|(id,match)| [id,match.to_a[1].downcase]}
  end

  # Products starting with "hypothetical" that is not followed by " protein".
  def hypothetical_products
    products_matching(/^([Hh]ypothetical)(?! protein)/).
      map{|i| (ERROR + "start with 'putative' instead of '%s.'") % i}
  end

  # Products ending in "domain", "related" or "like" (optionally followed
  # by one more character).
  def domain_related_like_ending_products
    products_matching(/([Dd]omain|[Rr]elated|[Ll]ike).?$/).
      map{|i| (ERROR + "products ending with '%s' are not allowed.") % i}
  end

  # Products mentioning "N-term" / "N term" / "N-terminal" and variations.
  def nterm_products
    products_matching(/(?!\B)([Nn][-\s][Tt]erm(inal)?)/).
      # The message has a single placeholder, so pass only the ID; handing
      # it the [id, match] pair triggers Ruby's "too many arguments for
      # format string" warning (an ArgumentError when $DEBUG is set).
      map(&:first).
      map{|i| (ERROR + "'N-terminal' or variations are not allowed.") % i}
  end

  # Products consisting solely of capital letters, whitespace and dashes.
  def all_caps_products
    products_matching(/^([A-Z\s-]+)$/).
      map(&:first).
      map{|i| (ERROR + "all caps product fields are not allowed.") % i}
  end

end
@@ -0,0 +1,12 @@
1
# Validator reporting sets of annotations that share the same start/end
# coordinates.
class GenomerPluginValidate::Validator::DuplicateCoordinates < Genomer::Plugin

  # Returns an Array with one message per set of co-located annotations,
  # listing their quoted IDs in sorted order.
  def run
    annotations.
      group_by{|attn| [attn.start, attn.end].sort }.
      select{|_,group| group.length > 1}.
      map do |(_,group)|
        # Sort on the string form of each ID: sorting a mix of nil and
        # String IDs raises ArgumentError. A nil ID is rendered as '',
        # exactly as string interpolation rendered it before.
        # NOTE(review): assumes `id` returns nil when the ID attribute is
        # missing - confirm against the annotation class.
        ids = group.map{|attn| attn.id.to_s }.sort
        quoted = ids.map{|id| "'#{id}'" }.join(', ')
        "Identical locations for #{quoted}"
      end
  end

end
@@ -0,0 +1,11 @@
1
# Validator reporting ID attribute values used by more than one annotation.
class GenomerPluginValidate::Validator::DuplicateID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns an Array with one message per duplicated ID. Annotations
  # without an ID (grouped under the nil key) are not reported here.
  def run
    repeated = annotations_by_attribute('ID').select do |id, entries|
      entries.length > 1 && !id.nil?
    end

    repeated.map do |(id, _)|
      "Duplicate ID '#{id}'"
    end
  end

end
@@ -0,0 +1,16 @@
1
# Validator reporting attribute keys that start with a capital letter but
# are not in the list of attribute names accepted below.
class GenomerPluginValidate::Validator::Gff3Attributes < Genomer::Plugin

  # Capitalised attribute names accepted by this validator.
  def valid_gff3_attributes
    %w[ID Name Alias Parent Target Gap Derives_from Note
       Dbxref Ontology_term Is_circular]
  end

  # Returns an Array with one message per offending attribute, in
  # annotation order and then attribute order.
  def run
    candidates = []
    annotations.each do |attn|
      attn.attributes.each{|(key,_)| candidates << [key,attn] }
    end

    # Only capitalised keys are checked against the accepted list.
    capitalised = candidates.select{|(key,_)| key =~ (/^[A-Z]/) }
    illegal     = capitalised.reject{|(key,_)| valid_gff3_attributes.include? key }

    illegal.map do |(key,attn)|
      "Illegal GFF3 attribute '#{key}' for '#{attn.id}'"
    end
  end
end
@@ -0,0 +1,13 @@
1
# Validator reporting whether any annotation lacks an ID attribute.
class GenomerPluginValidate::Validator::MissingID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns a single-message Array when at least one annotation is grouped
  # under the nil ID key, otherwise an empty Array.
  def run
    by_id = annotations_by_attribute('ID')
    return [] unless by_id.any?{|id,_| id.nil? }

    ["Annotations found with missing ID attribute"]
  end

end
@@ -0,0 +1,13 @@
1
# Validator reporting annotations that have neither a 'Name' nor a
# 'product' attribute.
class GenomerPluginValidate::Validator::NoNameOrProduct < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns an Array with one message per annotation ID that appears both
  # among the annotations lacking 'Name' and among those lacking 'product'.
  def run
    productless, nameless = %w[product Name].map do |attribute|
      annotations_by_attribute(attribute)[nil].map(&:id)
    end

    flagged = nameless & productless
    flagged.map{|id| "No 'Name' or 'product' attribute for annotation '#{id}'" }
  end

end
@@ -0,0 +1,13 @@
1
# Validator reporting annotations whose 'Name' attribute begins with a
# capital letter.
class GenomerPluginValidate::Validator::UppercaseName < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns an Array with one message per annotation with a capitalised
  # Name, giving the Name value and the annotation's ID.
  def run
    capitalised = annotations_by_attribute('Name').select do |(name,_)|
      name =~ /^[A-Z]/
    end

    offenders = capitalised.map{|(_,entries)| entries }.flatten

    offenders.map do |attn|
      "Illegal capitalised Name attribute '%s' for '%s'" % [attn.get_attribute('Name'), attn.id]
    end
  end

end