genevalidator 1.6.12 → 2.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ac9af62bb53cde2f76ae700a0098437f064dd354f9c4649006e3770a13b346ea
|
4
|
+
data.tar.gz: f655a495a8e1638b035f47c5caae2e1f449477198435bba6561f7af1e71a5142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8402a4e0f20fbc7d7765c01a8337ce5a425f5ed92f3616f10e6faa28fa24a4a8624fc6218d0bdf0773a7253934692baa6ae30cb2d87078415978600dacadc4db
|
7
|
+
data.tar.gz: 29fc2075baa01d90affa395b664adb884a4583ea4f50238fd6ecee9e5aa2e78f92c965256b4edc70590aaadd40cb61d4b206f7239c69da02e261882c4d689c6e
|
data/.gitignore
CHANGED
@@ -2,8 +2,37 @@
|
|
2
2
|
*.~*~
|
3
3
|
*.gem
|
4
4
|
.DS_Store
|
5
|
-
Gemfile.lock
|
6
5
|
# Gemnasium gem configuration file
|
7
6
|
config/gemnasium.yml
|
8
7
|
doc
|
9
8
|
.yardoc
|
9
|
+
tmp
|
10
|
+
*.rbc
|
11
|
+
.bundle
|
12
|
+
.config
|
13
|
+
InstalledFiles
|
14
|
+
_yardoc
|
15
|
+
coverage
|
16
|
+
doc/
|
17
|
+
lib/bundler/man
|
18
|
+
pkg
|
19
|
+
rdoc
|
20
|
+
spec/reports
|
21
|
+
test/tmp
|
22
|
+
test/version_tmp
|
23
|
+
*.bundle
|
24
|
+
*.so
|
25
|
+
*.o
|
26
|
+
*.a
|
27
|
+
mkmf.log
|
28
|
+
*-linux-x86/
|
29
|
+
*-linux-x86_64/
|
30
|
+
*-osx/
|
31
|
+
*-linux-x86.tar.gz
|
32
|
+
*-linux-x86_64.tar.gz
|
33
|
+
*-osx.tar.gz
|
34
|
+
test/test_files/all_validations_prot/prot.fa.html/
|
35
|
+
test/test_files/GV_*
|
36
|
+
exemplar_data/GV*
|
37
|
+
.vscode
|
38
|
+
q/
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.2
|
data/.travis.yml
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
|
-
- "2.
|
4
|
-
- "2.
|
5
|
-
- "2.
|
6
|
-
|
7
|
-
|
8
|
-
-
|
9
|
-
-
|
10
|
-
-
|
11
|
-
-
|
12
|
-
-
|
13
|
-
|
14
|
-
|
3
|
+
- "2.2.10"
|
4
|
+
- "2.3.7"
|
5
|
+
- "2.4.4"
|
6
|
+
- "2.5.1"
|
7
|
+
before_script:
|
8
|
+
- wget -P ~ https://mafft.cbrc.jp/alignment/software/mafft-7.397-linux.tgz
|
9
|
+
- tar -zxvf ~/mafft-7.397-linux.tgz -C ~
|
10
|
+
- mkdir ~/mafft_bin
|
11
|
+
- echo "#!/bin/bash" > ~/mafft_bin/mafft
|
12
|
+
- echo '$HOME/mafft-linux64/mafft.bat "$@"' >> ~/mafft_bin/mafft
|
13
|
+
- chmod 755 ~/mafft_bin/mafft
|
14
|
+
- export PATH=$PATH:~/mafft_bin
|
15
15
|
script: bundle exec rake test
|
16
|
+
after_script: bundle exec codeclimate-test-reporter
|
16
17
|
addons:
|
17
18
|
code_climate:
|
18
19
|
repo_token: 2177997ae2dd26804c32e1ec34a2221f94b71a2170f6c1db2c020f8858cd87f2
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
genevalidator (2.1.3)
|
5
|
+
bio (~> 1.4)
|
6
|
+
bio-blastxmlparser (~> 2.0)
|
7
|
+
genevalidatorapp (~> 2.1.3)
|
8
|
+
mechanize (= 2.7.5)
|
9
|
+
rack (~> 2.0)
|
10
|
+
slim (~> 3.0)
|
11
|
+
statsample (= 2.1.0)
|
12
|
+
|
13
|
+
GEM
|
14
|
+
remote: http://rubygems.org/
|
15
|
+
specs:
|
16
|
+
awesome_print (1.8.0)
|
17
|
+
backports (3.11.3)
|
18
|
+
bio (1.5.1)
|
19
|
+
bio-blastxmlparser (2.0.4)
|
20
|
+
bio-logger
|
21
|
+
nokogiri
|
22
|
+
bio-logger (1.0.1)
|
23
|
+
log4r (>= 1.1.9)
|
24
|
+
clbustos-rtf (0.4.2)
|
25
|
+
codeclimate-test-reporter (1.0.8)
|
26
|
+
simplecov (<= 0.13)
|
27
|
+
daru (0.1.6)
|
28
|
+
backports
|
29
|
+
dirty-memoize (0.0.4)
|
30
|
+
distribution (0.7.3)
|
31
|
+
docile (1.1.5)
|
32
|
+
domain_name (0.5.20180417)
|
33
|
+
unf (>= 0.0.5, < 1.0.0)
|
34
|
+
extendmatrix (0.4)
|
35
|
+
genevalidatorapp (2.1.3)
|
36
|
+
bio (~> 1.4)
|
37
|
+
sinatra (~> 2.0)
|
38
|
+
sinatra-cross_origin (~> 0.3)
|
39
|
+
slim (~> 3.0)
|
40
|
+
http-cookie (1.0.3)
|
41
|
+
domain_name (~> 0.5)
|
42
|
+
json (2.1.0)
|
43
|
+
log4r (1.1.10)
|
44
|
+
mechanize (2.7.5)
|
45
|
+
domain_name (~> 0.5, >= 0.5.1)
|
46
|
+
http-cookie (~> 1.0)
|
47
|
+
mime-types (>= 1.17.2)
|
48
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
49
|
+
net-http-persistent (~> 2.5, >= 2.5.2)
|
50
|
+
nokogiri (~> 1.6)
|
51
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
52
|
+
webrobots (>= 0.0.9, < 0.2)
|
53
|
+
mime-types (3.2.2)
|
54
|
+
mime-types-data (~> 3.2015)
|
55
|
+
mime-types-data (3.2018.0812)
|
56
|
+
mini_portile2 (2.3.0)
|
57
|
+
minimization (0.2.3)
|
58
|
+
text-table (~> 1.2)
|
59
|
+
minitest (5.11.3)
|
60
|
+
mustermann (1.0.2)
|
61
|
+
net-http-digest_auth (1.4.1)
|
62
|
+
net-http-persistent (2.9.4)
|
63
|
+
nokogiri (1.8.4)
|
64
|
+
mini_portile2 (~> 2.3.0)
|
65
|
+
ntlm-http (0.1.1)
|
66
|
+
prawn (0.8.4)
|
67
|
+
prawn-core (>= 0.8.4, < 0.9)
|
68
|
+
prawn-layout (>= 0.8.4, < 0.9)
|
69
|
+
prawn-security (>= 0.8.4, < 0.9)
|
70
|
+
prawn-core (0.8.4)
|
71
|
+
prawn-layout (0.8.4)
|
72
|
+
prawn-security (0.8.4)
|
73
|
+
prawn-svg (0.9.1.11)
|
74
|
+
prawn (>= 0.8.4)
|
75
|
+
rack (2.0.5)
|
76
|
+
rack-protection (2.0.3)
|
77
|
+
rack
|
78
|
+
rake (12.3.1)
|
79
|
+
reportbuilder (1.4.2)
|
80
|
+
clbustos-rtf (~> 0.4.0)
|
81
|
+
prawn (~> 0.8.4)
|
82
|
+
prawn-svg (~> 0.9.1)
|
83
|
+
text-table (~> 1.2)
|
84
|
+
rserve-client (0.3.5)
|
85
|
+
ruby-ole (1.2.12.1)
|
86
|
+
rubyvis (0.6.1)
|
87
|
+
simplecov (0.13.0)
|
88
|
+
docile (~> 1.1.0)
|
89
|
+
json (>= 1.8, < 3)
|
90
|
+
simplecov-html (~> 0.10.0)
|
91
|
+
simplecov-html (0.10.2)
|
92
|
+
sinatra (2.0.3)
|
93
|
+
mustermann (~> 1.0)
|
94
|
+
rack (~> 2.0)
|
95
|
+
rack-protection (= 2.0.3)
|
96
|
+
tilt (~> 2.0)
|
97
|
+
sinatra-cross_origin (0.4.0)
|
98
|
+
slim (3.0.9)
|
99
|
+
temple (>= 0.7.6, < 0.9)
|
100
|
+
tilt (>= 1.3.3, < 2.1)
|
101
|
+
spreadsheet (1.1.7)
|
102
|
+
ruby-ole (>= 1.0)
|
103
|
+
statsample (2.1.0)
|
104
|
+
awesome_print (~> 1.6)
|
105
|
+
daru (~> 0.1.6)
|
106
|
+
dirty-memoize (~> 0.0.4)
|
107
|
+
distribution (~> 0.7)
|
108
|
+
extendmatrix (~> 0.4)
|
109
|
+
minimization (~> 0.2)
|
110
|
+
reportbuilder (~> 1.4)
|
111
|
+
rserve-client (~> 0.3)
|
112
|
+
rubyvis (~> 0.6.1)
|
113
|
+
spreadsheet (~> 1.1)
|
114
|
+
temple (0.8.0)
|
115
|
+
text-table (1.2.4)
|
116
|
+
tilt (2.0.8)
|
117
|
+
unf (0.1.4)
|
118
|
+
unf_ext
|
119
|
+
unf_ext (0.0.7.5)
|
120
|
+
webrobots (0.1.2)
|
121
|
+
yard (0.9.16)
|
122
|
+
|
123
|
+
PLATFORMS
|
124
|
+
ruby
|
125
|
+
|
126
|
+
DEPENDENCIES
|
127
|
+
codeclimate-test-reporter (~> 1.0.0)
|
128
|
+
genevalidator!
|
129
|
+
minitest (~> 5.10)
|
130
|
+
rake (~> 12.3)
|
131
|
+
simplecov
|
132
|
+
yard (~> 0.9.11)
|
133
|
+
|
134
|
+
BUNDLED WITH
|
135
|
+
1.16.2
|
data/README.md
CHANGED
@@ -14,9 +14,7 @@ If you would like to use GeneValidator on a few sequences, see our online [GeneV
|
|
14
14
|
|
15
15
|
|
16
16
|
If you use GeneValidator in your work, please cite us as follows:
|
17
|
-
> [Dragan M<sup>‡</sup>, Moghul
|
18
|
-
|
19
|
-
|
17
|
+
> [Dragan M<sup>‡</sup>, Moghul I<sup>‡</sup>, Priyam A, Bustos C & Wurm Y. 2016. GeneValidator: identify problems with protein-coding gene predictions. <em>Bioinformatics</em>, doi: 10.1093/bioinformatics/btw015](https://academic.oup.com/bioinformatics/article/32/10/1559/1742817/GeneValidator-identify-problems-with-protein).
|
20
18
|
|
21
19
|
|
22
20
|
|
@@ -37,86 +35,59 @@ GeneValidator also runs a further two validation on cDNA sequences:
|
|
37
35
|
Each analysis of each query returns a binary result (good vs. potential problem) according to p-value or an empirically determined cutoff. The results for each query are combined into an overall quality score from 0 to 100.
|
38
36
|
|
39
37
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
38
|
## Installation
|
44
|
-
### Installation Requirements
|
45
|
-
* Ruby (>= 2.0.0)
|
46
|
-
* NCBI BLAST+ (>= 2.2.30+) (download [here](http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)).
|
47
|
-
* MAFFT installation (>=7.273) (download [here](http://mafft.cbrc.jp/alignment/software/)).
|
48
|
-
* A web browser - [Mozilla FireFox](https://www.mozilla.org/en-GB/firefox/new/) & Safari are recommended. At the moment, it is not possible to use Chrome to view the results locally (as chrome does not allow ajax to local files). To avoid this, simply use a different browser (like Firefox or Safari) or start a local server in the results folder.
|
49
|
-
|
50
|
-
Please see [here](https://gist.github.com/IsmailM/b783e8a06565197084e6) for more help with installing the prerequisites.
|
51
39
|
|
52
|
-
|
53
|
-
GeneValidator requires a protein BLAST database in order to fully analyse all sequences. The BLAST database needs to be set up with the `-parse_seqids` argument as follows:
|
40
|
+
Run the following in your terminal:
|
54
41
|
|
55
42
|
```bash
|
56
|
-
|
43
|
+
sh -c "$(curl -fsSL https://raw.githubusercontent.com/wurmlab/genevalidator/master/install.sh)"
|
57
44
|
```
|
58
45
|
|
59
|
-
|
60
|
-
Simply run the following command in the terminal.
|
46
|
+
By default, this will install GeneValidator in a folder called `genevalidator` in your current folder. If you wish to have GeneValidator installed in a different location, add the path to the end of the above install line. For example, to install GeneValidator in a hidden folder in your home directory:
|
61
47
|
|
62
48
|
```bash
|
63
|
-
|
49
|
+
sh -c "$(curl -fsSL https://raw.githubusercontent.com/wurmlab/genevalidator/master/install.sh)" ~/.genevalidator
|
64
50
|
```
|
65
51
|
|
66
|
-
|
52
|
+
Alternatively, download and uncompress the standalone package from our [releases](https://github.com/wurmlab/genevalidator/releases/latest) page.
|
67
53
|
|
68
|
-
|
69
|
-
It is also possible to run from source. However, this is not recommended.
|
54
|
+
The produced folder contains the following:
|
70
55
|
|
71
56
|
```bash
|
72
|
-
#
|
73
|
-
|
74
|
-
|
75
|
-
#
|
76
|
-
|
77
|
-
|
78
|
-
# Install bundler
|
79
|
-
gem install bundler
|
80
|
-
|
81
|
-
# Use bundler to install dependencies
|
82
|
-
bundle install
|
83
|
-
|
84
|
-
# Optional: run tests, build documentation and build the gem from source
|
85
|
-
bundle exec rake
|
86
|
-
|
87
|
-
# Run GeneValidator.
|
88
|
-
bundle exec genevalidator -h
|
89
|
-
# note that `bundle exec` executes GeneValidator in the context of the bundle
|
90
|
-
|
91
|
-
# Alternativaly, install GeneValidator as a gem
|
92
|
-
bundle exec rake install
|
93
|
-
genevalidator -h
|
57
|
+
Readme.txt # See Readme for version and basic usage information
|
58
|
+
bin/ # bin folder for genevalidator, BLAST+ and JQ (can add to $PATH)
|
59
|
+
blast_db/ # contains the SWISSPROT BLAST database.
|
60
|
+
exemplar_data/ # contains exemplar mrna and protein fasta files
|
61
|
+
lib/ # contains genevalidator dependencies
|
94
62
|
```
|
95
63
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
64
|
## Usage
|
102
|
-
|
65
|
+
|
66
|
+
GeneValidator can be run immediately after the GeneValidator package has been downloaded and uncompressed.
|
103
67
|
|
104
68
|
```bash
|
105
|
-
genevalidator
|
69
|
+
genevalidator -h
|
106
70
|
```
|
107
71
|
|
108
72
|
You should see the following output.
|
109
73
|
|
110
74
|
```bash
|
75
|
+
SUMMARY:
|
76
|
+
GeneValidator - Identify problems with predicted genes
|
77
|
+
|
111
78
|
USAGE:
|
112
|
-
|
79
|
+
genevalidator [OPTIONAL ARGUMENTS] INPUT_FILE
|
80
|
+
|
81
|
+
To run as a web application:
|
113
82
|
|
114
|
-
ARGUMENTS
|
115
|
-
|
83
|
+
genevalidator app [OPTIONAL ARGUMENTS]
|
84
|
+
|
85
|
+
See 'genevalidator app --help' for more information
|
116
86
|
|
117
87
|
OPTIONAL ARGUMENTS
|
118
|
-
|
119
|
-
|
88
|
+
|
89
|
+
--validations [VALIDATIONS] The Validations to be applied.
|
90
|
+
Validation Options Available (separated by comma):
|
120
91
|
all = All validations (default),
|
121
92
|
lenc = Length validation by clusterization,
|
122
93
|
lenr = Length validation by ranking,
|
@@ -125,46 +96,79 @@ OPTIONAL ARGUMENTS
|
|
125
96
|
frame = Open reading frame (ORF) validation,
|
126
97
|
orf = Main ORF validation,
|
127
98
|
align = Validating based on multiple alignment
|
128
|
-
-d, --db [
|
99
|
+
-d, --db [PATH] Path to the BLAST database
|
100
|
+
e.g. genevalidator -d /path/to/database.fa Input_File
|
129
101
|
GeneValidator also supports remote databases:
|
130
102
|
e.g. genevalidator -d "swissprot -remote" Input_File
|
131
|
-
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
103
|
+
-s, --select_single_best Writes the fasta sequence of the best scoring gene to STDOUT.
|
104
|
+
|
105
|
+
# OUTPUT ARGUMENTS
|
106
|
+
|
107
|
+
-o, --output_dir [PATH] Path to the output folder.
|
108
|
+
By default the output folder is in the same directory as the input
|
109
|
+
file and is named as input filename, followed by the time of
|
110
|
+
analysis
|
111
|
+
-f, --force_rewrite Rewrites over existing output.
|
112
|
+
--output_formats [STRING] Output Formats to generate. This can be either: "all", "html",
|
113
|
+
"csv", "json", "summary" or "stdout". Multiple formats can be
|
114
|
+
separated by a colon e.g. "csv:json".
|
115
|
+
By default, all output formats are generated.
|
116
|
+
|
117
|
+
# BLAST ARGUMENTS
|
118
|
+
|
119
|
+
--min_blast_hits_required [NUM]
|
120
|
+
The minimum number of BLAST hits required by GeneValidator in order
|
121
|
+
to carry out validations. Note: certain validations have their own
|
122
|
+
set minimum (such as the multiple alignment validation, which
|
123
|
+
requires a minimum of 10 BLAST hits)
|
124
|
+
-b, --blast_options [STRING] A string that is to be passed to BLAST
|
125
|
+
-x, --blast_xml_file [PATH] Provide GeneValidator with a pre-computed BLAST XML output
|
126
|
+
file (BLAST -outfmt option 5).
|
127
|
+
-t, --blast_tabular_file [PATH] Provide GeneValidator with a pre-computed BLAST tabular output
|
128
|
+
file. (BLAST -outfmt option 6).
|
129
|
+
--blast_tabular_options [STRING]
|
130
|
+
Custom format used in BLAST -outfmt argument
|
131
|
+
See BLAST+ manual pages for more details
|
132
|
+
--raw_sequences [PATH] Supply a fasta file of the raw sequences of all BLAST hits present
|
145
133
|
in the supplied BLAST XML or BLAST tabular file.
|
146
|
-
-b, --binaries [binaries] Path to BLAST and MAFFT bin folders (is added to $PATH variable)
|
147
|
-
To be provided as follows:
|
148
|
-
e.g. genevalidator -b /blast/bin/path/ -b /mafft/bin/path/
|
149
|
-
--version The version of GeneValidator that you are running.
|
150
|
-
-h, --help Show this screen.
|
151
|
-
```
|
152
134
|
|
135
|
+
# EXTRACT RAW SEQUENCES ARGUMENTS
|
153
136
|
|
137
|
+
-e, --extract_raw_seqs Extract a fasta file of the raw sequences of BLAST hits in the
|
138
|
+
supplied BLAST output file. This fasta file can then be provided to
|
139
|
+
GeneValidator with the "--raw_sequences" argument
|
154
140
|
|
141
|
+
# REPROCESS JSON ARGUMENTS
|
155
142
|
|
143
|
+
-j, --json_file [JSON_FILE] Path to json file. Re-generate the HTML report from a (filtered)
|
144
|
+
JSON file that was previously produced by GeneValidator
|
145
|
+
|
146
|
+
# GENERAL ARGUMENTS
|
147
|
+
|
148
|
+
-n, --num_threads [THREADS] Specify the number of processor threads to use when running
|
149
|
+
BLAST and GeneValidator.
|
150
|
+
-m, --mafft_threads [THREADS] Specify the number of processor threads to use when running
|
151
|
+
Mafft. Note Mafft is run independently in each of the threads
|
152
|
+
specified in --num_threads.
|
153
|
+
-r, --resume [DIR] Resume a previous analysis (creates a new output directory but
|
154
|
+
skips
|
155
|
+
--bin [DIR] Path to BLAST and MAFFT bin folders (is added to $PATH variable)
|
156
|
+
To be provided as follows:
|
157
|
+
e.g. genevalidator --bin /blast/bin/ --bin /mafft/bin/
|
158
|
+
-h, --help Show this screen.
|
159
|
+
-v, --version The version of GeneValidator that you are running.
```
|
156
160
|
|
157
161
|
## Example Usage Scenarios
|
158
162
|
|
159
|
-
#### Simplest Usage (using
|
160
|
-
This runs BLAST on
|
163
|
+
#### Simplest Usage (using included SWISSPROT database)
|
164
|
+
This runs BLAST on the included SwissProt BLAST database.
|
161
165
|
|
162
166
|
```bash
|
163
167
|
genevalidator INPUT_FASTA_FILE
|
164
168
|
```
|
165
169
|
|
166
|
-
#### Using
|
167
|
-
GeneValidator
|
170
|
+
#### Using an alternative BLAST database
|
171
|
+
GeneValidator requires a protein BLAST database in order to fully analyse all sequences. The BLAST database needs to be set up with the `-parse_seqids` argument of the makeblastdb script from BLAST+ (from Genevalidator Package, in the bin directory). See [this page](https://gist.github.com/IsmailM/3e3519de18c5b8b36d8aa0f223fb7948) for more information on how to set up BLAST databases.
|
168
172
|
|
169
173
|
```bash
|
170
174
|
genevalidator -d DATABASE_PATH -n NUM_THREADS INPUT_FASTA_FILE
|
@@ -179,15 +183,7 @@ GeneValidator supports the XML and tabular BLAST output formats.
|
|
179
183
|
# Run BLAST (XML output)
|
180
184
|
blast(p/x) -db DATABASE_PATH -num_threads NUM_THREADS -outfmt 5 -out BLAST_XML_FILE -query INPUT_FASTA_FILE
|
181
185
|
|
182
|
-
# Optional: Generate a fasta file for the BLAST hits.
|
183
|
-
# Note: this works best if you use the same database used to create the BLAST OUTPUT file.
|
184
|
-
genevalidator -d DATABASE_PATH -e -x BLAST_XML_FILE
|
185
|
-
|
186
186
|
# Run GeneValidator
|
187
|
-
## If you ran the previous command (i.e. if you produced fasta file for the BLAST hits)
|
188
|
-
genevalidator -n NUM_THREADS -x BLAST_XML_FILE -r RAW_SEQUENCES_FILE INPUT_FASTA_FILE
|
189
|
-
|
190
|
-
## If you did not run the previous command (this will run the previous command for you)
|
191
187
|
genevalidator -d DATABASE_PATH -n NUM_THREADS -x BLAST_XML_FILE INPUT_FASTA_FILE
|
192
188
|
```
|
193
189
|
|
@@ -197,28 +193,24 @@ This is the same, but using the BLAST tabular output.
|
|
197
193
|
# Run BLAST (tabular output)
|
198
194
|
blast(p/x) -db DATABASE_PATH -num_threads NUM_THREADS -outfmt '7 qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' -out BLAST_TAB_FILE -query INPUT_FASTA_FILE
|
199
195
|
|
200
|
-
# Optional: Generate a fasta file for the BLAST hits.
|
201
|
-
# Note: this works best if you use the same database used to create the BLAST OUTPUT file.
|
202
|
-
genevalidator -d DATABASE_PATH -e -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq'
|
203
|
-
|
204
196
|
# Run GeneValidator
|
205
|
-
|
206
|
-
genevalidator -n NUM_THREADS -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' -r RAW_SEQUENCES_FILE INPUT_FASTA_FILE
|
207
|
-
|
208
|
-
## If you did generate the BLAST hits fasta file (this will run the previous command for you)
|
209
|
-
genevalidator -d DATABASE_PATH -n NUM_THREADS -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' INPUT_FASTA_FILE
|
210
|
-
|
197
|
+
genevalidator -n NUM_THREADS -t BLAST_TAB_FILE -o 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' INPUT_FASTA_FILE
|
211
198
|
```
|
212
199
|
|
213
200
|
|
214
201
|
|
215
202
|
|
216
203
|
## Output
|
217
|
-
The output produced by GeneValidator is presented in
|
204
|
+
The output produced by GeneValidator is presented in four formats.
|
218
205
|
|
219
206
|
#### HTML Output
|
220
207
|
Firstly, the output is produced as a colourful, HTML file. This file is titled 'results.html' (found in the 'html' folder) and can be opened in a web browser (please use a supported browser - See [Installation Requirements](#installation-requirements)). This file contains all the results in an easy-to-view manner with graphical visualisations. See exemplar HTML output [here](http://wurmlab.github.io/tools/genevalidator/exemplar_data/protein_input/) (protein input data) and [here](http://wurmlab.github.io/tools/genevalidator/exemplar_data/genetic_input/) (DNA input data).
|
221
208
|
|
209
|
+
|
210
|
+
#### CSV Output
|
211
|
+
The output is also produced in CSV format, which is convenient for viewing in a spreadsheet or for further processing with standard tools.
|
212
|
+
|
213
|
+
|
222
214
|
#### JSON Output
|
223
215
|
The output is also produced in JSON. GeneValidator is able to re-generate results for any JSON files (or derived JSON files) that were previously generated by the program. This means that you are able to use the JSON file in your own analysis pipelines and then use GeneValidator to produce the HTML output for the analysed JSON file.
|
224
216
|
|
@@ -229,44 +221,34 @@ Lastly, a tabular summary of the results is also outputted in the terminal to pr
|
|
229
221
|
|
230
222
|
|
231
223
|
|
232
|
-
##
|
233
|
-
|
234
|
-
There are numerous methods to analyse the JSON output including the [streamable JSON command line program](http://trentm.com/json/) or [jq](https://stedolan.github.io/jq/). The below examples use the JSON tool.
|
235
|
-
|
236
|
-
### Examplar JSON CLI Installation
|
237
|
-
After installing node:
|
224
|
+
## Using the JSON output
|
238
225
|
|
239
|
-
|
240
|
-
$ npm install -g json
|
241
|
-
```
|
242
|
-
|
243
|
-
### Filtering the results
|
226
|
+
JSON output can be filtered or processed in a variety of ways using standard tools, such as the [streamable JSON command line program](http://trentm.com/json/) or [jq](https://stedolan.github.io/jq/). The examples below make use of jq 1.5, which is bundled with GeneValidator.
|
244
227
|
|
245
228
|
```bash
|
229
|
+
# Requires jq 1.5
|
246
230
|
|
247
231
|
# Extract sequences that have an overall score of 100
|
248
|
-
$
|
232
|
+
$ jq '.[] | select(.overall_score == 100)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
|
249
233
|
|
250
234
|
# Extract sequences that have an overall score of over 70
|
251
|
-
$
|
235
|
+
$ jq '.[] | select(.overall_score > 70)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
|
252
236
|
|
253
237
|
# Extract sequences that have more than 50 hits
|
254
|
-
$
|
238
|
+
$ jq '.[] | select(.no_hits > 50)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
|
255
239
|
|
256
240
|
# Sort the JSON based on the overall score (ascending - 0 to 100)
|
257
|
-
$
|
258
|
-
|
241
|
+
$ jq 'sort_by(.overall_score)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
|
259
242
|
# Sort the JSON based on the overall score (descending - 100 to 0)
|
260
|
-
|
243
|
+
$ jq 'sort_by(- .overall_score)' INPUT_JSON_FILE > OUTPUT_JSON_FILE
|
244
|
+
|
245
|
+
# Remove the large graphs objects (note these Graphs objects are required if you wish to pass the json back into GV using the `-j` option - see below)
|
246
|
+
$ jq --raw-output '[ .[] | del(.validations[].graphs) ]' INPUT_JSON_FILE > OUTPUT_JSON_FILE
|
261
247
|
```
|
262
248
|
|
263
249
|
The subsetted/sorted JSON file can then be passed back into GeneValidator (using the `-j` command line argument) to generate the HTML report for the sequences in the JSON file.
|
264
250
|
|
265
251
|
```bash
|
266
|
-
genevalidator -j
|
252
|
+
genevalidator -j OUTPUT_JSON_FILE
|
267
253
|
```
|
268
254
|
|
269
|
-
|
270
|
-
## Related projects
|
271
|
-
[GeneValidatorApp](https://github.com/wurmlab/GeneValidatorApp) - A Web App wrapper for GeneValidator.<br>
|
272
|
-
[GeneValidatorApp-API](https://github.com/wurmlab/GeneValidatorApp-API) - An easy to use API for GeneValidatorApp to allow you to use GeneValidator within your web applications.
|