genomer-plugin-validate 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/.gitignore +4 -0
  2. data/Gemfile +4 -0
  3. data/Rakefile +9 -0
  4. data/VERSION +1 -0
  5. data/features/annotations/bad-product-field.feature +91 -0
  6. data/features/annotations/command-line-interface.feature +19 -0
  7. data/features/annotations/duplicate_id.feature +144 -0
  8. data/features/annotations/identical_locations.feature +74 -0
  9. data/features/annotations/incorrect-attributes.feature +135 -0
  10. data/features/annotations/missing_attributes.feature +75 -0
  11. data/features/annotations/name.feature +40 -0
  12. data/features/command-line-interface.feature +36 -0
  13. data/features/support/env.rb +13 -0
  14. data/genomer-plugin-validate.gemspec +28 -0
  15. data/lib/extensions/string.rb +12 -0
  16. data/lib/genomer-plugin-validate.rb +33 -0
  17. data/lib/genomer-plugin-validate/group.rb +17 -0
  18. data/lib/genomer-plugin-validate/group/annotations.rb +20 -0
  19. data/lib/genomer-plugin-validate/validator.rb +27 -0
  20. data/lib/genomer-plugin-validate/validator/bad_product_field.rb +45 -0
  21. data/lib/genomer-plugin-validate/validator/duplicate_coordinates.rb +12 -0
  22. data/lib/genomer-plugin-validate/validator/duplicate_id.rb +11 -0
  23. data/lib/genomer-plugin-validate/validator/gff3_attributes.rb +16 -0
  24. data/lib/genomer-plugin-validate/validator/missing_id.rb +13 -0
  25. data/lib/genomer-plugin-validate/validator/no_name_or_product.rb +13 -0
  26. data/lib/genomer-plugin-validate/validator/uppercase_name.rb +13 -0
  27. data/lib/genomer-plugin-validate/validator/view_attributes.rb +16 -0
  28. data/man/genomer-validate.ronn +100 -0
  29. data/spec/genomer-plugin-validate/group/annotations_spec.rb +18 -0
  30. data/spec/genomer-plugin-validate/group_spec.rb +24 -0
  31. data/spec/genomer-plugin-validate/validator/bad_product_field_spec.rb +93 -0
  32. data/spec/genomer-plugin-validate/validator/duplicate_coordinates_spec.rb +24 -0
  33. data/spec/genomer-plugin-validate/validator/duplicate_id_spec.rb +34 -0
  34. data/spec/genomer-plugin-validate/validator/gff_attributes_spec.rb +32 -0
  35. data/spec/genomer-plugin-validate/validator/missing_id_spec.rb +27 -0
  36. data/spec/genomer-plugin-validate/validator/no_name_or_product_spec.rb +28 -0
  37. data/spec/genomer-plugin-validate/validator/uppercase_name_spec.rb +22 -0
  38. data/spec/genomer-plugin-validate/validator/view_attributes_spec.rb +31 -0
  39. data/spec/genomer-plugin-validate/validator_spec.rb +107 -0
  40. data/spec/genomer-plugin-validate_spec.rb +92 -0
  41. data/spec/spec_helper.rb +35 -0
  42. data/spec/validator_run_matcher.rb +25 -0
  43. metadata +244 -0
@@ -0,0 +1,75 @@
1
+ Feature: Validating annotation files for missing attributes
2
+ In order to submit genome annotations
3
+ A user can use the "annotations" command to detect missing ID, Name, or product attributes
4
+ to ensure that their annotation file contains no errors
5
+
6
+ @disable-bundler
7
+ Scenario: Validating an annotations file with a missing ID attribute
8
+ Given I successfully run `genomer init project`
9
+ And I cd to "project"
10
+ And I write to "assembly/scaffold.yml" with:
11
+ """
12
+ ---
13
+ - sequence:
14
+ source: contig1
15
+ """
16
+ And I write to "assembly/sequence.fna" with:
17
+ """
18
+ >contig1
19
+ AAAAATTTTTGGGGGCCCCC
20
+ """
21
+ And I write to "assembly/annotations.gff" with:
22
+ """
23
+ ##gff-version 3
24
+ contig1 . gene 1 3 . + 1 ID=gene1;Name=something
25
+ contig1 . gene 4 6 . + 1 Name=something
26
+ """
27
+ And I append to "Gemfile" with:
28
+ """
29
+ gem 'genomer-plugin-validate', :path => '../../../'
30
+ """
31
+ When I run `genomer validate annotations`
32
+ Then the exit status should be 0
33
+ And the output should contain:
34
+ """
35
+ Annotations found with missing ID attribute
36
+
37
+ """
38
+
39
+ @disable-bundler
40
+ Scenario: Validating an annotations file with missing Name and product attributes
41
+ Given I successfully run `genomer init project`
42
+ And I cd to "project"
43
+ And I write to "assembly/scaffold.yml" with:
44
+ """
45
+ ---
46
+ - sequence:
47
+ source: contig1
48
+ """
49
+ And I write to "assembly/sequence.fna" with:
50
+ """
51
+ >contig1
52
+ AAAAATTTTTGGGGGCCCCC
53
+ """
54
+ And I write to "assembly/annotations.gff" with:
55
+ """
56
+ ##gff-version 3
57
+ contig1 . gene 1 2 . + 1 ID=gene1;Name=something
58
+ contig1 . gene 3 4 . + 1 ID=gene2;product=something
59
+ contig1 . gene 5 6 . + 1 ID=gene3;product=something;Name=else
60
+ contig1 . gene 7 8 . + 1 ID=gene4
61
+ """
62
+ And I append to "Gemfile" with:
63
+ """
64
+ gem 'genomer-plugin-validate', :path => '../../../'
65
+ """
66
+ When I run `genomer validate annotations`
67
+ Then the exit status should be 0
68
+ And the output should not contain "gene1"
69
+ And the output should not contain "gene2"
70
+ And the output should not contain "gene3"
71
+ And the output should contain:
72
+ """
73
+ No 'Name' or 'product' attribute for annotation 'gene4'
74
+ """
75
+
@@ -0,0 +1,40 @@
1
+ Feature: Validating annotation files for incorrect names
2
+ In order to submit genome annotations
3
+ A user can use the "annotations" command to detect uppercase gene names
4
+ to ensure that their annotation file contains none of these
5
+
6
+ @disable-bundler
7
+ Scenario: Validating an annotations file with an uppercase name attribute
8
+ Given I successfully run `genomer init project`
9
+ And I cd to "project"
10
+ And I write to "assembly/scaffold.yml" with:
11
+ """
12
+ ---
13
+ - sequence:
14
+ source: contig1
15
+ """
16
+ And I write to "assembly/sequence.fna" with:
17
+ """
18
+ >contig1
19
+ AAAAATTTTTGGGGGCCCCC
20
+ """
21
+ And I write to "assembly/annotations.gff" with:
22
+ """
23
+ ##gff-version 3
24
+ contig1 . gene 1 3 . + 1 Name=Uppercase;ID=1
25
+ contig1 . gene 4 6 . + 1 Name=lowercase;ID=2
26
+ """
27
+ And I append to "Gemfile" with:
28
+ """
29
+ gem 'genomer-plugin-validate', :path => '../../../'
30
+ """
31
+ When I run `genomer validate annotations`
32
+ Then the exit status should be 0
33
+ And the output should contain:
34
+ """
35
+ Illegal capitalised Name attribute 'Uppercase' for '1'
36
+ """
37
+ And the output should not contain:
38
+ """
39
+ Illegal capitalised Name attribute 'lowercase' for '2'
40
+ """
@@ -0,0 +1,36 @@
1
+ Feature: The validator command line interface
2
+ In order to generate correct genomer builds
3
+ A user can use the "validator" plugin at the command line
4
+ to validate their genome build
5
+
6
+ @disable-bundler
7
+ Scenario: Running with just the 'validate' command
8
+ Given I successfully run `genomer init project`
9
+ And I cd to "project"
10
+ And I append to "Gemfile" with:
11
+ """
12
+ gem 'genomer-plugin-validate', :path => '../../../'
13
+ """
14
+ When I run `genomer validate`
15
+ Then the exit status should be 0
16
+ And the output should contain:
17
+ """
18
+ USAGE: genomer validate <GROUP>
19
+
20
+ Available validation groups:
21
+ """
22
+
23
+ @disable-bundler
24
+ Scenario: Running with an unknown validation group
25
+ Given I successfully run `genomer init project`
26
+ And I cd to "project"
27
+ And I append to "Gemfile" with:
28
+ """
29
+ gem 'genomer-plugin-validate', :path => '../../../'
30
+ """
31
+ When I run `genomer validate unknown`
32
+ Then the exit status should be 1
33
+ And the output should contain:
34
+ """
35
+ Error. Unknown validation group 'unknown'
36
+ """
@@ -0,0 +1,13 @@
1
require 'bundler'

# Activate the default and development gem groups before anything else is
# loaded; if Bundler cannot resolve them, report why and exit with Bundler's
# own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

# Put the project's lib/ and spec/ directories at the front of the load path
# (spec/ ends up first because it is unshifted last).
support_dir = File.dirname(__FILE__)
%w[lib spec].each do |dir|
  $LOAD_PATH.unshift(support_dir + '/../../' + dir)
end

require 'aruba/cucumber'
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
Gem::Specification.new do |s|
  s.name        = "genomer-plugin-validate"
  # Read VERSION relative to this gemspec instead of the current working
  # directory, and strip the trailing newline so the version string is clean.
  s.version     = File.read(File.expand_path('../VERSION', __FILE__)).strip
  s.authors     = ["Michael Barton"]
  s.email       = ["mail@michaelbarton.me.uk"]
  s.homepage    = ""
  s.summary     = %q{Validate assembly files for errors}
  s.description = %q{Test assembly files for common errors which may lead to incorrect assembly}

  s.rubyforge_project = "genomer-plugin-validate"

  # NOTE(review): the `git ls-files` calls below still depend on the current
  # working directory being inside the repository.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency "genomer",          ">= 0.0.4"
  s.add_runtime_dependency "heredoc_unindent", "~> 1.1.0"

  s.add_development_dependency "rake"
  s.add_development_dependency "rspec",                   "~> 2.8.0"
  s.add_development_dependency "scaffolder-test-helpers", "~> 0.4.1"
  s.add_development_dependency "cucumber",                "~> 1.1.4"
  s.add_development_dependency "aruba",                   "~> 0.4.11"
  s.add_development_dependency "rr",                      "~> 1.0.4"
end
@@ -0,0 +1,12 @@
1
class String

  # Taken from Rails
  #
  # Returns a snake_case copy of a CamelCase string: "::" becomes "/",
  # an underscore is inserted at each lower-to-upper (or acronym-to-word)
  # boundary, dashes become underscores, and the result is lower-cased.
  def underscore
    word = gsub(/::/, '/')
    word = word.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    word = word.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    word.tr("-", "_").downcase
  end

end
@@ -0,0 +1,33 @@
1
+ require "genomer"
2
+ require "heredoc_unindent"
3
+
4
# Genomer plugin behind the `genomer validate <GROUP>` command: looks up the
# named validation group and runs each of its validators.
class GenomerPluginValidate < Genomer::Plugin
  require 'genomer-plugin-validate/validator'
  require 'genomer-plugin-validate/group'

  # Runs every validator belonging to the group named by the first argument.
  #
  # Returns the help message when no group name was given; otherwise the
  # messages produced by the group's validators, joined with newlines.
  # Raises Genomer::Error when the group name is not known.
  def run
    name = arguments.shift
    return self.class.help_message if name.nil?

    group = Group.groups[name]
    raise Genomer::Error, "Unknown validation group '#{name}'" if group.nil?

    # Validator.validators globs the validator directory and rebuilds its
    # lookup hash on every call, so resolve it once instead of per validator.
    available = Validator.validators
    messages = group.validators.map do |key|
      available[key].new(arguments, flags).run
    end
    messages.flatten * "\n"
  end

  # Builds the usage text: a fixed header followed by one line per validation
  # group giving its name (padded to 15 columns) and description.
  def self.help_message
    msg = <<-EOS.unindent
      USAGE: genomer validate <GROUP>

      Available validation groups:
    EOS
    # Build the listing separately so the result does not hinge on how
    # `<<`, `do ... end` and `*` bind to one another.
    listing = Group.groups.map do |(group_name, group)|
      ' ' + group_name.ljust(15) + group.description
    end
    msg << listing * "\n"
  end

end
@@ -0,0 +1,17 @@
1
# Namespace holding the validation groups; each constant defined under it is
# treated as one group.
module GenomerPluginValidate::Group

  # Requires every Ruby source file in the group directory.
  #
  # Globbing for "*.rb" (rather than filtering with an unanchored /\.rb/)
  # keeps editor backups and compiled files such as "foo.rb.orig" or
  # "foo.rbc" from being required. Sorting makes the load order deterministic.
  def self.load
    pattern = File.join(File.dirname(__FILE__),'..','genomer-plugin-validate','group','*.rb')
    Dir[pattern].sort.each do |file|
      require file
    end
  end

  # Returns a Hash mapping each group's lower-cased constant name to the
  # group class, loading the group files first.
  def self.groups
    load
    pairs = constants.map do |name|
      [name.to_s.downcase, const_get(name)]
    end
    Hash[pairs]
  end

end
@@ -0,0 +1,20 @@
1
# Validation group covering the GFF3 annotations file.
class GenomerPluginValidate::Group::Annotations

  # Short human-readable summary of this group.
  def self.description
    "Validate GFF3 annotations file"
  end

  # Symbol keys of the validators that make up this group, in listed order.
  # A fresh array is built on every call.
  def self.validators
    %w[
      duplicate_id
      missing_id
      no_name_or_product
      gff3_attributes
      view_attributes
      duplicate_coordinates
      uppercase_name
      bad_product_field
    ].map(&:to_sym)
  end

end
@@ -0,0 +1,27 @@
1
+ require 'extensions/string'
2
+
3
# Namespace for the individual validators, and a mixin providing helpers
# shared by validator classes that include it.
module GenomerPluginValidate::Validator

  # Requires every Ruby source file in the validator directory.
  #
  # Globbing for "*.rb" (rather than filtering with an unanchored /\.rb/)
  # keeps editor backups and compiled files such as "foo.rb.orig" or
  # "foo.rbc" from being required. Sorting makes the load order deterministic.
  def self.load
    pattern = File.join(File.dirname(__FILE__),'..','genomer-plugin-validate','validator','*.rb')
    Dir[pattern].sort.each do |file|
      require file
    end
  end

  # Returns a Hash mapping the underscored, symbolised name of each constant
  # in this module (e.g. :duplicate_id) to the constant itself, loading the
  # validator files first.
  def self.validators
    load
    pairs = constants.map do |name|
      [name.to_s.underscore.to_sym, const_get(name)]
    end
    Hash[pairs]
  end

  # Groups the including object's annotations by the value of one attribute.
  #
  # attr - the attribute name passed to each annotation's get_attribute.
  #
  # Returns a Hash from the attribute value (as a String, or nil when the
  # annotation has no such attribute) to the Array of matching annotations.
  # The Hash keeps its default block, so looking up an absent key yields an
  # empty Array rather than nil; callers index it with [nil] directly.
  def annotations_by_attribute(attr)
    grouped = Hash.new{|h,k| h[k] = []}
    annotations.each do |attn|
      value = attn.get_attribute(attr)
      grouped[value ? value.to_s : nil] << attn
    end
    grouped
  end

end
@@ -0,0 +1,45 @@
1
# Validator reporting annotations whose "product" attribute matches one of
# several disallowed patterns.
class GenomerPluginValidate::Validator::BadProductField < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Common message prefix; its single %s receives the annotation id.
  ERROR = "Bad product field for '%s:' "

  # Returns a flat Array of error messages from all product checks.
  def run
    [
      hypothetical_products,
      domain_related_like_ending_products,
      nterm_products,
      all_caps_products
    ].flatten
  end

  # Finds annotations whose product attribute matches +re+.
  #
  # re - a Regexp whose first capture group is the offending text.
  #
  # Returns an Array of [id, captured_text] pairs with the captured text
  # lower-cased. Annotations without a product are grouped under nil, which
  # Regexp#match treats as "no match", so they are skipped.
  def products_matching(re)
    annotations_by_attribute("product").
      map{|(product,entries)| entries.map{|i| [i.id,product]}}.
      flatten(1).
      map{|(id,product)| [id, re.match(product)]}.
      select{|(_,match)| match}.
      map{|(id,match)| [id,match.to_a[1].downcase]}
  end

  # Products starting with "hypothetical" not followed by " protein".
  def hypothetical_products
    products_matching(/^([Hh]ypothetical)(?! protein)/).
      map{|i| (ERROR + "start with 'putative' instead of '%s.'") % i}
  end

  # Products ending in "domain", "related" or "like" (optionally followed by
  # one trailing character).
  def domain_related_like_ending_products
    products_matching(/([Dd]omain|[Rr]elated|[Ll]ike).?$/).
      map{|i| (ERROR + "products ending with '%s' are not allowed.") % i}
  end

  # Products containing "N-term" / "N terminal" style wording.
  def nterm_products
    # The message has a single %s (the id), so pass only the id. Supplying
    # the whole [id, match] pair gives the format string a surplus argument,
    # which Ruby rejects with ArgumentError when $DEBUG is set.
    products_matching(/(?!\B)([Nn][-\s][Tt]erm(inal)?)/).
      map(&:first).
      map{|i| (ERROR + "'N-terminal' or variations are not allowed.") % i}
  end

  # Products made up solely of capital letters, whitespace and dashes.
  def all_caps_products
    products_matching(/^([A-Z\s-]+)$/).
      map(&:first).
      map{|i| (ERROR + "all caps product fields are not allowed.") % i}
  end

end
@@ -0,0 +1,12 @@
1
# Validator reporting annotations that share the same start/end coordinates.
class GenomerPluginValidate::Validator::DuplicateCoordinates < Genomer::Plugin

  # Returns one message per set of two or more annotations with the same
  # (order-independent) start and end positions, listing their sorted ids.
  def run
    by_location = annotations.group_by do |attn|
      [attn.start, attn.end].sort
    end

    clashes = by_location.values.select{|group| group.length > 1}

    clashes.map do |group|
      quoted_ids = group.map(&:id).sort.map{|id| "'#{id}'"}
      "Identical locations for #{quoted_ids.join(', ')}"
    end
  end

end
@@ -0,0 +1,11 @@
1
# Validator reporting ID attribute values used by more than one annotation.
class GenomerPluginValidate::Validator::DuplicateID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per non-nil ID shared by two or more annotations.
  # Annotations lacking an ID (grouped under nil) are ignored here.
  def run
    messages = []
    annotations_by_attribute('ID').each do |id, entries|
      next if id.nil? || entries.length <= 1
      messages << "Duplicate ID '#{id}'"
    end
    messages
  end

end
@@ -0,0 +1,16 @@
1
# Validator reporting capitalised attribute keys that are not in the list of
# accepted GFF3 attribute names.
class GenomerPluginValidate::Validator::Gff3Attributes < Genomer::Plugin

  # Attribute names accepted when the key starts with a capital letter.
  def valid_gff3_attributes
    %w|ID Name Alias Parent Target Gap Derives_from Note
       Dbxref Ontology_term Is_circular|
  end

  # Returns one message per attribute key that begins with an upper-case
  # letter and is not in valid_gff3_attributes; other keys are ignored.
  def run
    messages = []
    annotations.each do |attn|
      attn.attributes.each do |(term, _)|
        next unless term =~ (/^[A-Z]/)
        next if valid_gff3_attributes.include? term
        messages << "Illegal GFF3 attribute '#{term}' for '#{attn.id}'"
      end
    end
    messages
  end
end
@@ -0,0 +1,13 @@
1
# Validator reporting whether any annotation lacks an ID attribute.
class GenomerPluginValidate::Validator::MissingID < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns a single-message Array when at least one annotation has no ID
  # (i.e. is grouped under the nil key), otherwise an empty Array.
  def run
    return [] unless annotations_by_attribute('ID').key?(nil)

    ["Annotations found with missing ID attribute"]
  end

end
@@ -0,0 +1,13 @@
1
# Validator reporting annotations that have neither a Name nor a product
# attribute.
class GenomerPluginValidate::Validator::NoNameOrProduct < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per annotation id that appears both among the
  # annotations without a Name and among those without a product.
  def run
    nameless, productless = %w|Name product|.map do |attr|
      annotations_by_attribute(attr)[nil].map(&:id)
    end

    (nameless & productless).map do |id|
      "No 'Name' or 'product' attribute for annotation '#{id}'"
    end
  end

end
@@ -0,0 +1,13 @@
1
# Validator reporting annotations whose Name attribute starts with a capital
# letter.
class GenomerPluginValidate::Validator::UppercaseName < Genomer::Plugin
  include GenomerPluginValidate::Validator

  # Returns one message per annotation whose Name begins with an upper-case
  # letter, quoting the Name value and the annotation id.
  def run
    messages = []
    annotations_by_attribute('Name').each do |name, entries|
      next unless name =~ /^[A-Z]/
      entries.each do |entry|
        details = [entry.get_attribute('Name'), entry.id]
        messages << "Illegal capitalised Name attribute '%s' for '%s'" % details
      end
    end
    messages
  end

end