genevalidator 1.6.12 → 2.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +30 -1
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +13 -12
  5. data/Gemfile +4 -1
  6. data/Gemfile.lock +135 -0
  7. data/README.md +104 -122
  8. data/Rakefile +377 -5
  9. data/aux/gv_results.slim +155 -0
  10. data/aux/html_files/css/gv.compiled.min.css +8 -0
  11. data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
  12. data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
  13. data/aux/{files → html_files}/css/src/style.css +0 -0
  14. data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
  15. data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
  16. data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
  17. data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
  18. data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
  19. data/aux/{files → html_files}/img/gene.png +0 -0
  20. data/aux/html_files/js/gv.compiled.min.js +1 -0
  21. data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
  22. data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
  23. data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
  24. data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
  25. data/aux/{files → html_files}/js/src/plots.js +1 -1
  26. data/aux/{files → html_files}/js/src/script.js +0 -0
  27. data/aux/{files → html_files}/json/.gitkeep +0 -0
  28. data/bin/genevalidator +393 -56
  29. data/exemplar_data/README.md +60 -0
  30. data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
  31. data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
  32. data/genevalidator.gemspec +35 -20
  33. data/install.sh +92 -0
  34. data/lib/genevalidator.rb +171 -56
  35. data/lib/genevalidator/arg_validation.rb +26 -55
  36. data/lib/genevalidator/blast.rb +44 -99
  37. data/lib/genevalidator/clusterization.rb +18 -22
  38. data/lib/genevalidator/exceptions.rb +17 -17
  39. data/lib/genevalidator/ext/array.rb +21 -4
  40. data/lib/genevalidator/get_raw_sequences.rb +32 -31
  41. data/lib/genevalidator/hsp.rb +31 -2
  42. data/lib/genevalidator/json_to_gv_results.rb +38 -122
  43. data/lib/genevalidator/output.rb +158 -172
  44. data/lib/genevalidator/output_files.rb +134 -0
  45. data/lib/genevalidator/pool.rb +2 -5
  46. data/lib/genevalidator/query.rb +1 -1
  47. data/lib/genevalidator/tabular_parser.rb +8 -29
  48. data/lib/genevalidator/validation.rb +48 -90
  49. data/lib/genevalidator/validation_alignment.rb +64 -75
  50. data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
  51. data/lib/genevalidator/validation_duplication.rb +85 -84
  52. data/lib/genevalidator/validation_gene_merge.rb +46 -35
  53. data/lib/genevalidator/validation_length_cluster.rb +18 -15
  54. data/lib/genevalidator/validation_length_rank.rb +19 -15
  55. data/lib/genevalidator/validation_maker_qi.rb +13 -12
  56. data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
  57. data/lib/genevalidator/validation_report.rb +1 -1
  58. data/lib/genevalidator/validation_test.rb +1 -1
  59. data/lib/genevalidator/version.rb +1 -1
  60. data/test/overall.rb +1 -1
  61. data/test/test_all_validations.rb +36 -24
  62. data/test/test_blast.rb +39 -24
  63. data/test/test_clusterization_2d.rb +4 -4
  64. data/test/test_helper.rb +2 -2
  65. data/test/test_query.rb +16 -20
  66. data/test/test_validation_open_reading_frame.rb +122 -122
  67. data/test/test_validations.rb +12 -10
  68. metadata +94 -79
  69. data/aux/files/css/genevalidator.compiled.min.css +0 -16
  70. data/aux/files/js/genevalidator.compiled.min.js +0 -28
  71. data/aux/json_footer.erb +0 -8
  72. data/aux/json_header.erb +0 -19
  73. data/aux/json_query.erb +0 -15
  74. data/aux/template_footer.erb +0 -8
  75. data/aux/template_header.erb +0 -19
  76. data/aux/template_query.erb +0 -14
  77. data/data/README.md +0 -57
  78. data/data/mrna_data.fasta.blast_tabular +0 -3567
  79. data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
  80. data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
  81. data/data/mrna_data.fasta.blast_xml +0 -39800
  82. data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
  83. data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
  84. data/data/mrna_data.fasta.json +0 -1
  85. data/data/protein_data.fasta.blast_tabular +0 -3278
  86. data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
  87. data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
  88. data/data/protein_data.fasta.blast_xml +0 -26228
  89. data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
  90. data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
  91. data/data/protein_data.fasta.json +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 4847ad48911fc47c1e3f6a74f5c4c05d2b736d52
4
- data.tar.gz: 0a198a16c9a31d89d12bce6b70e799d2071bbf3a
2
+ SHA256:
3
+ metadata.gz: ac9af62bb53cde2f76ae700a0098437f064dd354f9c4649006e3770a13b346ea
4
+ data.tar.gz: f655a495a8e1638b035f47c5caae2e1f449477198435bba6561f7af1e71a5142
5
5
  SHA512:
6
- metadata.gz: 6de4d4413f911a31fb840bfe13b942b7c3741e409e50a795be4e4edb009867f1347c1eda328bda759b26f66a88775bf10cb2defcb588f4662da69af88265cdf9
7
- data.tar.gz: ff9d98fb9eeac013f69aa473cfb66a0d617fc911c41a862188c263d74c2ea719c292bd62c6f112eb7138efaccf61aa7990ccd324b50b2c1433b38117ade6e56e
6
+ metadata.gz: 8402a4e0f20fbc7d7765c01a8337ce5a425f5ed92f3616f10e6faa28fa24a4a8624fc6218d0bdf0773a7253934692baa6ae30cb2d87078415978600dacadc4db
7
+ data.tar.gz: 29fc2075baa01d90affa395b664adb884a4583ea4f50238fd6ecee9e5aa2e78f92c965256b4edc70590aaadd40cb61d4b206f7239c69da02e261882c4d689c6e
data/.gitignore CHANGED
@@ -2,8 +2,37 @@
2
2
  *.~*~
3
3
  *.gem
4
4
  .DS_Store
5
- Gemfile.lock
6
5
  # Gemnasium gem configuration file
7
6
  config/gemnasium.yml
8
7
  doc
9
8
  .yardoc
9
+ tmp
10
+ *.rbc
11
+ .bundle
12
+ .config
13
+ InstalledFiles
14
+ _yardoc
15
+ coverage
16
+ doc/
17
+ lib/bundler/man
18
+ pkg
19
+ rdoc
20
+ spec/reports
21
+ test/tmp
22
+ test/version_tmp
23
+ *.bundle
24
+ *.so
25
+ *.o
26
+ *.a
27
+ mkmf.log
28
+ *-linux-x86/
29
+ *-linux-x86_64/
30
+ *-osx/
31
+ *-linux-x86.tar.gz
32
+ *-linux-x86_64.tar.gz
33
+ *-osx.tar.gz
34
+ test/test_files/all_validations_prot/prot.fa.html/
35
+ test/test_files/GV_*
36
+ exemplar_data/GV*
37
+ .vscode
38
+ q/
@@ -0,0 +1 @@
1
+ 2.2
@@ -1,18 +1,19 @@
1
1
  language: ruby
2
2
  rvm:
3
- - "2.0.0"
4
- - "2.1.3"
5
- - "2.2.0"
6
- before_install:
7
- - wget -P ~ http://mafft.cbrc.jp/alignment/software/mafft-7.205-with-extensions-src.tgz
8
- - tar -zxvf ~/mafft-7.205-with-extensions-src.tgz -C ~
9
- - mkdir ~/mafft
10
- - ruby -pi -e "gsub(/^PREFIX = \/usr\/local/, 'PREFIX = ~/mafft/')" ~/mafft-7.205-with-extensions/core/Makefile
11
- - (cd ~/mafft-7.205-with-extensions/core/ && make clean && make && make install)
12
- - export PATH=$PATH:~/mafft/bin
13
- cache: bundler
14
- sudo: false
3
+ - "2.2.10"
4
+ - "2.3.7"
5
+ - "2.4.4"
6
+ - "2.5.1"
7
+ before_script:
8
+ - wget -P ~ https://mafft.cbrc.jp/alignment/software/mafft-7.397-linux.tgz
9
+ - tar -zxvf ~/mafft-7.397-linux.tgz -C ~
10
+ - mkdir ~/mafft_bin
11
+ - echo "#!/bin/bash" > ~/mafft_bin/mafft
12
+ - echo '$HOME/mafft-linux64/mafft.bat "$@"' >> ~/mafft_bin/mafft
13
+ - chmod 755 ~/mafft_bin/mafft
14
+ - export PATH=$PATH:~/mafft_bin
15
15
  script: bundle exec rake test
16
+ after_script: bundle exec codeclimate-test-reporter
16
17
  addons:
17
18
  code_climate:
18
19
  repo_token: 2177997ae2dd26804c32e1ec34a2221f94b71a2170f6c1db2c020f8858cd87f2
data/Gemfile CHANGED
@@ -1,4 +1,7 @@
1
1
  source 'http://rubygems.org'
2
2
 
3
3
  gemspec
4
- gem 'codeclimate-test-reporter', group: :test, require: nil
4
+ group :test do
5
+ gem 'codeclimate-test-reporter', '~> 1.0.0'
6
+ gem 'simplecov'
7
+ end
@@ -0,0 +1,135 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ genevalidator (2.1.3)
5
+ bio (~> 1.4)
6
+ bio-blastxmlparser (~> 2.0)
7
+ genevalidatorapp (~> 2.1.3)
8
+ mechanize (= 2.7.5)
9
+ rack (~> 2.0)
10
+ slim (~> 3.0)
11
+ statsample (= 2.1.0)
12
+
13
+ GEM
14
+ remote: http://rubygems.org/
15
+ specs:
16
+ awesome_print (1.8.0)
17
+ backports (3.11.3)
18
+ bio (1.5.1)
19
+ bio-blastxmlparser (2.0.4)
20
+ bio-logger
21
+ nokogiri
22
+ bio-logger (1.0.1)
23
+ log4r (>= 1.1.9)
24
+ clbustos-rtf (0.4.2)
25
+ codeclimate-test-reporter (1.0.8)
26
+ simplecov (<= 0.13)
27
+ daru (0.1.6)
28
+ backports
29
+ dirty-memoize (0.0.4)
30
+ distribution (0.7.3)
31
+ docile (1.1.5)
32
+ domain_name (0.5.20180417)
33
+ unf (>= 0.0.5, < 1.0.0)
34
+ extendmatrix (0.4)
35
+ genevalidatorapp (2.1.3)
36
+ bio (~> 1.4)
37
+ sinatra (~> 2.0)
38
+ sinatra-cross_origin (~> 0.3)
39
+ slim (~> 3.0)
40
+ http-cookie (1.0.3)
41
+ domain_name (~> 0.5)
42
+ json (2.1.0)
43
+ log4r (1.1.10)
44
+ mechanize (2.7.5)
45
+ domain_name (~> 0.5, >= 0.5.1)
46
+ http-cookie (~> 1.0)
47
+ mime-types (>= 1.17.2)
48
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
49
+ net-http-persistent (~> 2.5, >= 2.5.2)
50
+ nokogiri (~> 1.6)
51
+ ntlm-http (~> 0.1, >= 0.1.1)
52
+ webrobots (>= 0.0.9, < 0.2)
53
+ mime-types (3.2.2)
54
+ mime-types-data (~> 3.2015)
55
+ mime-types-data (3.2018.0812)
56
+ mini_portile2 (2.3.0)
57
+ minimization (0.2.3)
58
+ text-table (~> 1.2)
59
+ minitest (5.11.3)
60
+ mustermann (1.0.2)
61
+ net-http-digest_auth (1.4.1)
62
+ net-http-persistent (2.9.4)
63
+ nokogiri (1.8.4)
64
+ mini_portile2 (~> 2.3.0)
65
+ ntlm-http (0.1.1)
66
+ prawn (0.8.4)
67
+ prawn-core (>= 0.8.4, < 0.9)
68
+ prawn-layout (>= 0.8.4, < 0.9)
69
+ prawn-security (>= 0.8.4, < 0.9)
70
+ prawn-core (0.8.4)
71
+ prawn-layout (0.8.4)
72
+ prawn-security (0.8.4)
73
+ prawn-svg (0.9.1.11)
74
+ prawn (>= 0.8.4)
75
+ rack (2.0.5)
76
+ rack-protection (2.0.3)
77
+ rack
78
+ rake (12.3.1)
79
+ reportbuilder (1.4.2)
80
+ clbustos-rtf (~> 0.4.0)
81
+ prawn (~> 0.8.4)
82
+ prawn-svg (~> 0.9.1)
83
+ text-table (~> 1.2)
84
+ rserve-client (0.3.5)
85
+ ruby-ole (1.2.12.1)
86
+ rubyvis (0.6.1)
87
+ simplecov (0.13.0)
88
+ docile (~> 1.1.0)
89
+ json (>= 1.8, < 3)
90
+ simplecov-html (~> 0.10.0)
91
+ simplecov-html (0.10.2)
92
+ sinatra (2.0.3)
93
+ mustermann (~> 1.0)
94
+ rack (~> 2.0)
95
+ rack-protection (= 2.0.3)
96
+ tilt (~> 2.0)
97
+ sinatra-cross_origin (0.4.0)
98
+ slim (3.0.9)
99
+ temple (>= 0.7.6, < 0.9)
100
+ tilt (>= 1.3.3, < 2.1)
101
+ spreadsheet (1.1.7)
102
+ ruby-ole (>= 1.0)
103
+ statsample (2.1.0)
104
+ awesome_print (~> 1.6)
105
+ daru (~> 0.1.6)
106
+ dirty-memoize (~> 0.0.4)
107
+ distribution (~> 0.7)
108
+ extendmatrix (~> 0.4)
109
+ minimization (~> 0.2)
110
+ reportbuilder (~> 1.4)
111
+ rserve-client (~> 0.3)
112
+ rubyvis (~> 0.6.1)
113
+ spreadsheet (~> 1.1)
114
+ temple (0.8.0)
115
+ text-table (1.2.4)
116
+ tilt (2.0.8)
117
+ unf (0.1.4)
118
+ unf_ext
119
+ unf_ext (0.0.7.5)
120
+ webrobots (0.1.2)
121
+ yard (0.9.16)
122
+
123
+ PLATFORMS
124
+ ruby
125
+
126
+ DEPENDENCIES
127
+ codeclimate-test-reporter (~> 1.0.0)
128
+ genevalidator!
129
+ minitest (~> 5.10)
130
+ rake (~> 12.3)
131
+ simplecov
132
+ yard (~> 0.9.11)
133
+
134
+ BUNDLED WITH
135
+ 1.16.2
data/README.md CHANGED
@@ -14,9 +14,7 @@ If you would like to use GeneValidator on a few sequences, see our online [GeneV
14
14
 
15
15
 
16
16
  If you use GeneValidator in your work, please cite us as follows:
17
- > [Dragan M<sup>&Dagger;</sup>, Moghul MI<sup>&Dagger;</sup>, Priyam A, Bustos C & Wurm Y. 2016. GeneValidator: identify problems with protein-coding gene predictions. <em>Bioinformatics</em>, doi: 10.1093/bioinformatics/btw015](http://bioinformatics.oxfordjournals.org/content/early/2016/02/26/bioinformatics.btw015).
18
-
19
-
17
+ > [Dragan M<sup>&Dagger;</sup>, Moghul I<sup>&Dagger;</sup>, Priyam A, Bustos C & Wurm Y. 2016. GeneValidator: identify problems with protein-coding gene predictions. <em>Bioinformatics</em>, doi: 10.1093/bioinformatics/btw015](https://academic.oup.com/bioinformatics/article/32/10/1559/1742817/GeneValidator-identify-problems-with-protein).
20
18
 
21
19
 
22
20
 
@@ -37,86 +35,59 @@ GeneValidator also runs a further two validation on cDNA sequences:
37
35
  Each analysis of each query returns a binary result (good vs. potential problem) according to p-value or an empirically determined cutoff. The results for each query are combined into an overall quality score from 0 to 100.
38
36
 
39
37
 
40
-
41
-
42
-
43
38
  ## Installation
44
- ### Installation Requirements
45
- * Ruby (>= 2.0.0)
46
- * NCBI BLAST+ (>= 2.2.30+) (download [here](http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)).
47
- * MAFFT installation (>=7.273) (download [here](http://mafft.cbrc.jp/alignment/software/)).
48
- * A web browser - [Mozilla FireFox](https://www.mozilla.org/en-GB/firefox/new/) & Safari are recommended. At the moment, it is not possible to use Chrome to view the results locally (as chrome does not allow ajax to local files). To avoid this, simply use a different browser (like Firefox or Safari) or start a local server in the results folder.
49
-
50
- Please see [here](https://gist.github.com/IsmailM/b783e8a06565197084e6) for more help with installing the prerequisites.
51
39
 
52
- #### Setting up a BLAST database
53
- GeneValidator requires a protein BLAST database in order to fully analyse all sequences. The BLAST database needs to be set up with the `-parse_seqids` argument as follows:
40
+ Run the following in your terminal:
54
41
 
55
42
  ```bash
56
- makeblastdb -in input_db -dbtype prot -parse_seqids
43
+ sh -c "$(curl -fsSL https://raw.githubusercontent.com/wurmlab/genevalidator/master/install.sh)"
57
44
  ```
58
45
 
59
- ### Installation
60
- Simply run the following command in the terminal.
46
+ By default this will install in a folder called `genevalidator` in your current folder. If you wish to have GeneValidator installed in a different location, add the path to the end of the above install line. For example, to install GeneValidator in a hidden folder in your home path:
61
47
 
62
48
  ```bash
63
- gem install genevalidator
49
+ sh -c "$(curl -fsSL https://raw.githubusercontent.com/wurmlab/genevalidator/master/install.sh)" ~/.genevalidator
64
50
  ```
65
51
 
66
- If that doesn't work, try `sudo gem install genevalidator` instead.
52
+ Alternatively, download and uncompress the standalone package from our [releases](https://github.com/wurmlab/genevalidator/releases/latest) page.
67
53
 
68
- ##### Running From Source (Not Recommended)
69
- It is also possible to run from source. However, this is not recommended.
54
+ The produced folder contains the following:
70
55
 
71
56
  ```bash
72
- # Clone the repository.
73
- git clone https://github.com/wurmlab/genevalidator.git
74
-
75
- # Move into GeneValidator source directory.
76
- cd GeneValidator
77
-
78
- # Install bundler
79
- gem install bundler
80
-
81
- # Use bundler to install dependencies
82
- bundle install
83
-
84
- # Optional: run tests, build documentation and build the gem from source
85
- bundle exec rake
86
-
87
- # Run GeneValidator.
88
- bundle exec genevalidator -h
89
- # note that `bundle exec` executes GeneValidator in the context of the bundle
90
-
91
- # Alternativaly, install GeneValidator as a gem
92
- bundle exec rake install
93
- genevalidator -h
57
+ Readme.txt # See Readme for version and basic usage information
58
+ bin/ # bin folder for genevalidator, BLAST+ and JQ (can add to $PATH)
59
+ blast_db/ # contains the SWISSPROT BLAST database.
60
+ exemplar_data/ # contains exemplar mrna and protein fasta files
61
+ lib/ # contains genevalidator dependencies
94
62
  ```
95
63
 
96
-
97
-
98
-
99
-
100
-
101
64
  ## Usage
102
- Verify GeneValidator installed by running the following command in the terminal:
65
+
66
+ GeneValidator can be run immediately after the GeneValidator package has been downloaded and uncompressed.
103
67
 
104
68
  ```bash
105
- genevalidator
69
+ genevalidator -h
106
70
  ```
107
71
 
108
72
  You should see the following output.
109
73
 
110
74
  ```bash
75
+ SUMMARY:
76
+ GeneValidator - Identify problems with predicted genes
77
+
111
78
  USAGE:
112
- genevalidator [OPTIONS] Input_File
79
+ genevalidator [OPTIONAL ARGUMENTS] INPUT_FILE
80
+
81
+ To run as a web application:
113
82
 
114
- ARGUMENTS:
115
- Input_File: Path to the input fasta file containing the predicted sequences.
83
+ genevalidator app [OPTIONAL ARGUMENTS]
84
+
85
+ See 'genevalidator app --help' for more information
116
86
 
117
87
  OPTIONAL ARGUMENTS
118
- -v, --validations <String> The Validations to be applied.
119
- Validation Options Available (separated by coma):
88
+
89
+ --validations [VALIDATIONS] The Validations to be applied.
90
+ Validation Options Available (separated by comma):
120
91
  all = All validations (default),
121
92
  lenc = Length validation by clusterization,
122
93
  lenr = Length validation by ranking,
@@ -125,46 +96,79 @@ OPTIONAL ARGUMENTS
125
96
  frame = Open reading frame (ORF) validation,
126
97
  orf = Main ORF validation,
127
98
  align = Validating based on multiple alignment
128
- -d, --db [BLAST_DATABASE] Path to the BLAST database
99
+ -d, --db [PATH] Path to the BLAST database
100
+ e.g. genevalidator -d /path/to/database.fa Input_File
129
101
  GeneValidator also supports remote databases:
130
102
  e.g. genevalidator -d "swissprot -remote" Input_File
131
- -e, --extract_raw_seqs Produces a fasta file of the raw sequences of all BLAST hits in the
132
- supplied BLAST output file. This fasta file can then be provided to
133
- GeneValidator with the "-r", "--raw_sequences" argument
134
- -j, --json_file [JSON_FILE] Generate HTML report from a JSON file (or a subset of a JSON file)
135
- produced by GeneValidator
136
- -x [BLAST_XML_FILE], Provide GeneValidator with a pre-computed BLAST XML output
137
- --blast_xml_file file (BLAST -outfmt option 5).
138
- -t [BLAST_TABULAR_FILE], Provide GeneValidator with a pre-computed BLAST tabular output
139
- --blast_tabular_file file. (BLAST -outfmt option 6).
140
- -o [BLAST_TABULAR_OPTIONS], Custom format used in BLAST -outfmt argument
141
- --blast_tabular_options See BLAST+ manual pages for more details
142
- -n, --num_threads num_of_threads Specify the number of processor threads to use when running
143
- BLAST and Mafft within GeneValidator.
144
- -r, --raw_sequences [raw_seq] Supply a fasta file of the raw sequences of all BLAST hits present
103
+ -s, --select_single_best Writes the fasta sequence of the best scoring gene to STDOUT.
104
+
105
+ # OUTPUT ARGUMENTS
106
+
107
+ -o, --output_dir [PATH] Path to the output folder.
108
+ By default the output folder is in the same directory as the input
109
+ file and is named as input filename, followed by the time of
110
+ analysis
111
+ -f, --force_rewrite Rewrites over existing output.
112
+ --output_formats [STRING] Output Formats to generate. This can be either: "all", "html",
113
+ "csv", "json", "summary" or "stdout". Multiple formats can be
114
+ separated by a colon e.g. "csv:json".
115
+ By default, all output formats are generated.
116
+
117
+ # BLAST ARGUMENTS
118
+
119
+ --min_blast_hits_required [NUM]
120
+ The minimum number of BLAST hits required by GeneValidator in order
121
+ to carry out validations. Note: certain validations have their own
122
+ set minimum (such as the multiple alignment validation, which
123
+ requires a minimum of 10 BLAST hits)
124
+ -b, --blast_options [STRING] A string that is to be passed to BLAST
125
+ -x, --blast_xml_file [PATH] Provide GeneValidator with a pre-computed BLAST XML output
126
+ file (BLAST -outfmt option 5).
127
+ -t, --blast_tabular_file [PATH] Provide GeneValidator with a pre-computed BLAST tabular output
128
+ file. (BLAST -outfmt option 6).
129
+ --blast_tabular_options [STRING]
130
+ Custom format used in BLAST -outfmt argument
131
+ See BLAST+ manual pages for more details
132
+ --raw_sequences [PATH] Supply a fasta file of the raw sequences of all BLAST hits present
145
133
  in the supplied BLAST XML or BLAST tabular file.
146
- -b, --binaries [binaries] Path to BLAST and MAFFT bin folders (is added to $PATH variable)
147
- To be provided as follows:
148
- e.g. genevalidator -b /blast/bin/path/ -b /mafft/bin/path/
149
- --version The version of GeneValidator that you are running.
150
- -h, --help Show this screen.
151
- ```
152
134
 
135
+ # EXTRACT RAW SEQUENCES ARGUMENTS
153
136
 
137
+ -e, --extract_raw_seqs Extract a fasta file of the raw sequences of BLAST hits in the
138
+ supplied BLAST output file. This fasta file can then be provided to
139
+ GeneValidator with the "--raw_sequences" argument
154
140
 
141
+ # REPROCESS JSON ARGUMENTS
155
142
 
143
+ -j, --json_file [JSON_FILE] Path to json file. Re-generate the HTML report from a (filtered)
144
+ JSON file that was previously produced by GeneValidator
145
+
146
+ # GENERAL ARGUMENTS
147
+
148
+ -n, --num_threads [THREADS] Specify the number of processor threads to use when running
149
+ BLAST and GeneValidator.
150
+ -m, --mafft_threads [THREADS] Specify the number of processor threads to use when running
151
+ Mafft. Note Mafft is run independently in each of the threads
152
+ specified in --num_threads.
153
+ -r, --resume [DIR] Resume a previous analysis (creates a new output directory but
154
+ skips
155
+ --bin [DIR] Path to BLAST and MAFFT bin folders (is added to $PATH variable)
156
+ To be provided as follows:
157
+ e.g. genevalidator --bin /blast/bin/ --bin /mafft/bin/
158
+ -h, --help Show this screen.
159
+ -v, --version The version of GeneValidator that you are running.
+ ```
156
160
 
157
161
  ## Example Usage Scenarios
158
162
 
159
- #### Simplest Usage (using NCBI remote BLAST servers)
160
- This runs BLAST on NCBI remote Swiss-Prot BLAST database. As such this is suitable for analyses on less than 10 sequences.
163
+ #### Simplest Usage (using included SWISSPROT database)
164
+ This runs BLAST on the included SwissProt BLAST database.
161
165
 
162
166
  ```bash
163
167
  genevalidator INPUT_FASTA_FILE
164
168
  ```
165
169
 
166
- #### Using a local BLAST database.
167
- GeneValidator would run BLAST (using an E-Value 1e-5) on each query against the provided BLAST database and then run the validation analyses.
170
+ #### Using an alternative BLAST database
171
+ GeneValidator requires a protein BLAST database in order to fully analyse all sequences. The BLAST database needs to be set up with the `-parse_seqids` argument of the makeblastdb script from BLAST+ (from Genevalidator Package, in the bin directory). See [this page](https://gist.github.com/IsmailM/3e3519de18c5b8b36d8aa0f223fb7948) for more information on how to set up BLAST databases.
168
172
 
169
173
  ```bash
170
174
  genevalidator -d DATABASE_PATH -n NUM_THREADS INPUT_FASTA_FILE
@@ -179,15 +183,7 @@ GeneValidator supports the XML and tabular BLAST output formats.
179
183
  # Run BLAST (XML output)
180
184
  blast(p/x) -db DATABASE_PATH -num_threads NUM_THREADS -outfmt 5 -out BLAST_XML_FILE -query INPUT_FASTA_FILE
181
185
 
182
- # Optional: Generate a fasta file for the BLAST hits.
183
- # Note: this works best if you use the same database used to create the BLAST OUTPUT file.
184
- genevalidator -d DATABASE_PATH -e -x BLAST_XML_FILE
185
-
186
186
  # Run GeneValidator
187
- ## If you ran the previous command (i.e. if you produced fasta file for the BLAST hits)
188
- genevalidator -n NUM_THREADS -x BLAST_XML_FILE -r RAW_SEQUENCES_FILE INPUT_FASTA_FILE
189
-
190
- ## If you did not run the previous command (this will run the previous command for you)
191
187
  genevalidator -d DATABASE_PATH -n NUM_THREADS -x BLAST_XML_FILE INPUT_FASTA_FILE
192
188
  ```
193
189
 
@@ -197,28 +193,24 @@ This is the same, but using the BLAST tabular output.
197
193
  # Run BLAST (tabular output)
198
194
  blast(p/x) -db DATABASE_PATH -num_threads NUM_THREADS -outfmt '7 qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' -out BLAST_TAB_FILE -query INPUT_FASTA_FILE
199
195
 
200
- # Optional: Generate a fasta file for the BLAST hits.
201
- # Note: this works best if you use the same database used to create the BLAST OUTPUT file.
202
- genevalidator -d DATABASE_PATH -e -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq'
203
-
204
196
  # Run GeneValidator
205
- ## If you ran the previous command (i.e. if you produced fasta file for the BLAST hits)
206
- genevalidator -n NUM_THREADS -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' -r RAW_SEQUENCES_FILE INPUT_FASTA_FILE
207
-
208
- ## If you did generate the BLAST hits fasta file (this will run the previous command for you)
209
- genevalidator -d DATABASE_PATH -n NUM_THREADS -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' INPUT_FASTA_FILE
210
-
197
+ genevalidator -n NUM_THREADS -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' INPUT_FASTA_FILE
211
198
  ```
212
199
 
213
200
 
214
201
 
215
202
 
216
203
  ## Output
217
- The output produced by GeneValidator is presented in three manners.
204
+ The output produced by GeneValidator is presented in four manners.
218
205
 
219
206
  #### HTML Output
220
207
  Firstly, the output is produced as a colourful, HTML file. This file is titled 'results.html' (found in the 'html' folder) and can be opened in a web browser (please use a supported browser - See [Installation Requirements](#installation-requirements)). This file contains all the results in an easy-to-view manner with graphical visualisations. See exemplar HTML output [here](http://wurmlab.github.io/tools/genevalidator/exemplar_data/protein_input/) (protein input data) and [here](http://wurmlab.github.io/tools/genevalidator/exemplar_data/genetic_input/) (DNA input data).
221
208
 
209
+
210
+ #### CSV Output
211
+ The output is also produced as a CSV file. This provides a tabular summary of the results for each query, which can be opened in spreadsheet software or used in your own analysis pipelines.
212
+
213
+
222
214
  #### JSON Output
223
215
  The output is also produced in JSON. GeneValidator is able to re-generate results for any JSON files (or derived JSON files) that were previously generated by the program. This means that you are able to use the JSON file in your own analysis pipelines and then use GeneValidator to produce the HTML output for the analysed JSON file.
224
216
 
@@ -229,44 +221,34 @@ Lastly, a tabular summary of the results is also outputted in the terminal to pr
229
221
 
230
222
 
231
223
 
232
- ## Analysing the JSON output
233
-
234
- There are numerous methods to analyse the JSON output including the [streamable JSON command line program](http://trentm.com/json/) or [jq](https://stedolan.github.io/jq/). The below examples use the JSON tool.
235
-
236
- ### Examplar JSON CLI Installation
237
- After installing node:
224
+ ## Using the JSON output
238
225
 
239
- ```bash
240
- $ npm install -g json
241
- ```
242
-
243
- ### Filtering the results
226
+ JSON output can be filtered or processed in a variety of ways using standard tools, such as the [streamable JSON command line program](http://trentm.com/json/), or [jq](https://stedolan.github.io/jq/). The examples below make use of jq 1.5, which is bundled with GeneValidator.
244
227
 
245
228
  ```bash
229
+ # Requires jq 1.5
246
230
 
247
231
  # Extract sequences that have an overall score of 100
248
- $ json -f INPUT_JSON_FILE -c 'this.overall_score == 100' > OUTPUT_JSON_FILE
232
+ $ jq '.[] | select(.overall_score == 100)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
249
233
 
250
234
  # Extract sequences that have an overall score of over 70
251
- $ json -f INPUT_JSON_FILE -c 'this.overall_score > 70' > OUTPUT_JSON_FILE
235
+ $ jq '.[] | select(.overall_score > 70)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
252
236
 
253
237
  # Extract sequences that have more than 50 hits
254
- $ json -f INPUT_JSON_FILE -c 'this.no_hits > 50' > OUTPUT_JSON_FILE
238
+ $ jq '.[] | select(.no_hits > 50)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
255
239
 
256
240
  # Sort the JSON based on the overall score (ascending - 0 to 100)
257
- $ json -f INPUT_JSON_FILE -A -e 'this.sort(function(a,b) {return (a.overall_score > b.overall_score) ? 1 : ((b.overall_score > a.overall_score) ? -1 : 0);} );' > OUTPUT_JSON_FILE
258
-
241
+ $ jq 'sort_by(.overall_score)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
259
242
  # Sort the JSON based on the overall score (descending - 100 to 0)
260
- json -f INPUT_JSON_FILE -A -e 'this.sort(function(a,b) {return (a.overall_score < b.overall_score) ? 1 : ((b.overall_score < a.overall_score) ? -1 : 0);} );' > OUTPUT_JSON_FILE
243
+ $ jq 'sort_by(- .overall_score)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
244
+
245
+ # Remove the large graphs objects (note these Graphs objects are required if you wish to pass the json back into GV using the `-j` option - see below)
246
+ $ jq --raw-output '[ .[] | del(.validations[].graphs) ]' INPUT_JSON_FILE > OUTPUT_JSON_FILE
261
247
  ```
262
248
 
263
249
  The subsetted/sorted JSON file can then be passed back into GeneValidator (using the `-j` command line argument) to generate the HTML report for the sequences in the JSON file.
264
250
 
265
251
  ```bash
266
- genevalidator -j SORTED_JSON_FILE
252
+ genevalidator -j OUTPUT_JSON_FILE
267
253
  ```
268
254
 
269
-
270
- ## Related projects
271
- [GeneValidatorApp](https://github.com/wurmlab/GeneValidatorApp) - A Web App wrapper for GeneValidator.<br>
272
- [GeneValidatorApp-API](https://github.com/wurmlab/GeneValidatorApp-API) - An easy to use API for GeneValidatorApp to allow you to use GeneValidator within your web applications.