genevalidator 1.6.12 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Running GeneValidator with sample data
|
|
2
|
+
|
|
3
|
+
Here, we walk through the steps involved in analysing some sample data with GeneValidator. There are two options on how to run genevalidator - the second option is faster with larger input files.
|
|
4
|
+
|
|
5
|
+
## Expected Results
|
|
6
|
+
|
|
7
|
+
<strong>protein_data.fa</strong> [See here](http://wurmlab.github.io/tools/genevalidator/examplar_data/protein_input/)
|
|
8
|
+
<strong>mrna_data.fa</strong> [See here](http://wurmlab.github.io/tools/genevalidator/examplar_data/genetic_input/)
|
|
9
|
+
|
|
10
|
+
##### Running GeneValidator with the included SwissProt Database, with four threads
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
# Protein data
|
|
14
|
+
$ genevalidator -n 4 protein_data.fa
|
|
15
|
+
|
|
16
|
+
# MRNA data
|
|
17
|
+
$ genevalidator -n 4 mrna_data.fa
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
This will produce a folder that will contain your result files.
|
|
21
|
+
|
|
22
|
+
##### Running GeneValidator with a pre-computed BLAST XML file
|
|
23
|
+
|
|
24
|
+
For protein_data.fa:
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
blastp -db DATABASE_PATH -num_threads 4 -out protein_data.blast.xml -query protein_data.fa -outfmt 5
|
|
28
|
+
|
|
29
|
+
# Run GeneValidator
|
|
30
|
+
genevalidator -d DATABASE_PATH -n 4 -x protein_data.blast.xml protein_data.fa
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
For mrna_data.fa:
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
blastx -db DATABASE_PATH -num_threads 4 -out mrna_data.blast.xml -query mrna_data.fa -outfmt 5
|
|
37
|
+
|
|
38
|
+
# Run GeneValidator
|
|
39
|
+
genevalidator -d DATABASE_PATH -n 4 -x mrna_data.blast.xml mrna_data.fa
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
##### Running GeneValidator with a pre-computed BLAST tabular file
|
|
43
|
+
|
|
44
|
+
For protein_data.fa:
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
blastp -db DATABASE_PATH -num_threads 4 -out protein_data.blast.tsv -query protein_data.fa -outfmt '7 qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq'
|
|
48
|
+
|
|
49
|
+
# Run GeneValidator
|
|
50
|
+
genevalidator -d DATABASE_PATH -n 4 -t protein_data.blast.tsv --blast_tabular_options 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' protein_data.fa
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
For mrna_data.fa:
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
blastx -db DATABASE_PATH -num_threads 4 -out mrna_data.blast.tsv -query mrna_data.fa -outfmt '7 qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq'
|
|
57
|
+
|
|
58
|
+
# Run GeneValidator
|
|
59
|
+
genevalidator -d DATABASE_PATH -n 4 -t mrna_data.blast.tsv --blast_tabular_options 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' mrna_data.fa
|
|
60
|
+
```
|
|
@@ -235,4 +235,4 @@ CCATGCCGGAGCATCAGTAGATCTTGCCATCTTCTCCCTTCATCTGGCAGGTGTCTCCTC
|
|
|
235
235
|
CATCCTCGGAGCAATTAACTTTATCACCACAGCCATCAACATGAAACCACCTGCCCTCTC
|
|
236
236
|
ACAATACCAAACCCCCCTATTCGTTTGATCCGTCTTAATTACCGCCATCCTTCTTCTCCT
|
|
237
237
|
TTCCCTCCCAGTTCTCGCCGCTGGTATTACAATGCTTCTAACAGATCGAAATCTAAACAC
|
|
238
|
-
TACATTCTTCGACCCTGCAGGGGGCGGAGACCCAATTTTATACCAACACTTA
|
|
238
|
+
TACATTCTTCGACCCTGCAGGGGGCGGAGACCCAATTTTATACCAACACTTA
|
|
File without changes
|
data/genevalidator.gemspec
CHANGED
|
@@ -1,12 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
3
|
require 'genevalidator/version'
|
|
5
4
|
|
|
6
5
|
Gem::Specification.new do |s|
|
|
7
|
-
# meta
|
|
8
6
|
s.name = 'genevalidator'
|
|
9
|
-
s.version =
|
|
7
|
+
s.version = GeneValidator::VERSION
|
|
10
8
|
s.authors = ['Monica Dragan', 'Ismail Moghul', 'Anurag Priyam',
|
|
11
9
|
'Yannick Wurm']
|
|
12
10
|
s.email = 'y.wurm@qmul.ac.uk'
|
|
@@ -14,19 +12,27 @@ Gem::Specification.new do |s|
|
|
|
14
12
|
s.license = 'AGPL'
|
|
15
13
|
s.summary = 'Identifying problems with gene predictions.'
|
|
16
14
|
s.description = 'The tool validates the input predicted genes and provides' \
|
|
17
|
-
' useful information (length validation, gene merge'\
|
|
15
|
+
' useful information (length validation, gene merge' \
|
|
18
16
|
' validation, sequence duplication checking, ORF finding)' \
|
|
19
17
|
' based on the similarities to genes in public databases.'
|
|
18
|
+
s.required_ruby_version = '>= 2.2.0'
|
|
20
19
|
|
|
21
|
-
s.
|
|
22
|
-
s.add_development_dependency '
|
|
23
|
-
s.add_development_dependency '
|
|
24
|
-
|
|
25
|
-
s.
|
|
26
|
-
s.
|
|
27
|
-
s.add_dependency
|
|
28
|
-
s.add_dependency
|
|
29
|
-
s.add_dependency
|
|
20
|
+
s.add_development_dependency 'minitest', '~> 5.10'
|
|
21
|
+
s.add_development_dependency 'rake', '~> 12.3'
|
|
22
|
+
s.add_development_dependency 'yard', '~> 0.9.11'
|
|
23
|
+
|
|
24
|
+
s.add_dependency 'bio', '~> 1.4'
|
|
25
|
+
s.add_dependency 'bio-blastxmlparser', '~> 2.0'
|
|
26
|
+
s.add_dependency 'genevalidatorapp', '~> 2.1.3'
|
|
27
|
+
s.add_dependency 'rack', '~> 2.0'
|
|
28
|
+
s.add_dependency 'slim', '~>3.0'
|
|
29
|
+
s.add_dependency 'statsample', '2.1.0'
|
|
30
|
+
|
|
31
|
+
# Adding mechanize gem just to silence a message on load.
|
|
32
|
+
# This is due to the Statsample gem
|
|
33
|
+
# See https://github.com/SciRuby/daru/issues/404
|
|
34
|
+
# See https://github.com/SciRuby/statsample/pull/69
|
|
35
|
+
s.add_dependency 'mechanize', '2.7.5'
|
|
30
36
|
|
|
31
37
|
s.files = `git ls-files -z`.split("\x0")
|
|
32
38
|
s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
@@ -35,15 +41,24 @@ Gem::Specification.new do |s|
|
|
|
35
41
|
|
|
36
42
|
s.post_install_message = <<INFO
|
|
37
43
|
|
|
38
|
-
|
|
39
|
-
|
|
44
|
+
----------------------------------------------------------------------------
|
|
45
|
+
Thank you for validating your gene predictions with GeneValidator!
|
|
46
|
+
|
|
47
|
+
==> To launch GeneValidator execute 'genevalidator' from command line.
|
|
48
|
+
|
|
49
|
+
genevalidator [OPTIONAL ARGUMENTS] INPUT_FILE
|
|
50
|
+
|
|
51
|
+
See 'genevalidator --help' for more information
|
|
52
|
+
|
|
53
|
+
==> To launch GeneValidator as a web application execute 'genevalidator' from command line.
|
|
54
|
+
|
|
55
|
+
genevalidator app [OPTIONAL ARGUMENTS]
|
|
40
56
|
|
|
41
|
-
|
|
57
|
+
See 'genevalidator app --help' for more information
|
|
42
58
|
|
|
43
|
-
|
|
59
|
+
==> Visit https://wurmlab.github.io/tools/genevalidator/ for more information.
|
|
44
60
|
|
|
45
|
-
|
|
46
|
-
------------------------------------------------------------------------
|
|
61
|
+
----------------------------------------------------------------------------
|
|
47
62
|
|
|
48
63
|
INFO
|
|
49
64
|
end
|
data/install.sh
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#!/bin/sh
|
|
2
|
+
|
|
3
|
+
## USAGE: bash install.sh $INSTALL_DIR
|
|
4
|
+
## $ bash install.sh $INSTALL_DIR
|
|
5
|
+
|
|
6
|
+
set -eu
|
|
7
|
+
|
|
8
|
+
# OS detection
|
|
9
|
+
KERNEL="$(uname -s | tr '[:upper:]' '[:lower:]')"
|
|
10
|
+
|
|
11
|
+
if [ "$KERNEL" = "darwin" ]; then
|
|
12
|
+
PLATFORM='osx'
|
|
13
|
+
elif [ "$KERNEL" = "linux" ]; then
|
|
14
|
+
ARCH=$(uname -m)
|
|
15
|
+
if [ "$ARCH" = "x86_64" ]; then
|
|
16
|
+
PLATFORM='linux-x86_64'
|
|
17
|
+
else
|
|
18
|
+
PLATFORM='linux-x86'
|
|
19
|
+
fi
|
|
20
|
+
fi
|
|
21
|
+
|
|
22
|
+
# If there is an argument then that is where GV will be installed
|
|
23
|
+
if [ "$0" = 'sh' ]; then
|
|
24
|
+
# I.e. when piping from curl
|
|
25
|
+
INSTALL_DIR=$PWD/genevalidator
|
|
26
|
+
elif [ "$0" = 'install.sh' ]; then
|
|
27
|
+
# I.e. when running directly
|
|
28
|
+
INSTALL_DIR=$PWD/genevalidator
|
|
29
|
+
else
|
|
30
|
+
INSTALL_DIR="$0"
|
|
31
|
+
fi
|
|
32
|
+
|
|
33
|
+
GV_URL=$(curl -s https://api.github.com/repos/wurmlab/genevalidator/releases/latest \
|
|
34
|
+
| grep browser_download_url \
|
|
35
|
+
| grep -i $PLATFORM \
|
|
36
|
+
| cut -d '"' -f 4)
|
|
37
|
+
|
|
38
|
+
echo >&2 "==> Installing GeneValidator to:"
|
|
39
|
+
echo >&2 " ${INSTALL_DIR}"
|
|
40
|
+
echo >&2
|
|
41
|
+
|
|
42
|
+
mkdir "${INSTALL_DIR}"
|
|
43
|
+
curl -SL "$GV_URL" | tar zxf - -C "${INSTALL_DIR}" --strip-components 1
|
|
44
|
+
|
|
45
|
+
echo >&2
|
|
46
|
+
echo >&2 "==> GeneValidator successfully installed."
|
|
47
|
+
|
|
48
|
+
### Check which SHELL and then test different profile files
|
|
49
|
+
case $SHELL in
|
|
50
|
+
*/zsh)
|
|
51
|
+
# assume Zsh
|
|
52
|
+
if test -e "${HOME}/.zshrc"; then
|
|
53
|
+
DOT_FILE=${HOME}/.zshrc
|
|
54
|
+
elif test -e "${HOME}/.zprofile"; then
|
|
55
|
+
DOT_FILE=${HOME}/.zprofile
|
|
56
|
+
elif test -e "${HOME}/.profile"; then
|
|
57
|
+
DOT_FILE=${HOME}/.profile
|
|
58
|
+
fi
|
|
59
|
+
;;
|
|
60
|
+
*/bash)
|
|
61
|
+
# assume Bash
|
|
62
|
+
if test -e "${HOME}/.bashrc"; then
|
|
63
|
+
DOT_FILE=${HOME}/.bashrc
|
|
64
|
+
elif test -e "${HOME}/.bash_profile"; then
|
|
65
|
+
DOT_FILE=${HOME}/.bash_profile
|
|
66
|
+
elif test -e "${HOME}/.profile"; then
|
|
67
|
+
DOT_FILE=${HOME}/.profile
|
|
68
|
+
fi
|
|
69
|
+
;;
|
|
70
|
+
*)
|
|
71
|
+
if test -e "${HOME}/.profile"; then
|
|
72
|
+
DOT_FILE=${HOME}/.profile
|
|
73
|
+
fi
|
|
74
|
+
esac
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
if [ -z ${DOT_FILE+x} ]; then
|
|
78
|
+
# DOT File hasn't been set.
|
|
79
|
+
echo >&2
|
|
80
|
+
echo >&2 '==> No profile files were found.'
|
|
81
|
+
echo >&2 ' Please create one and add the following line to that file:'
|
|
82
|
+
echo >&2
|
|
83
|
+
echo >&2 ' export PATH="'"${INSTALL_DIR}"'/bin:${PATH}"'
|
|
84
|
+
else
|
|
85
|
+
echo >&2 'export PATH="'"${INSTALL_DIR}"'/bin:${PATH}"' >> "${DOT_FILE}"
|
|
86
|
+
echo >&2
|
|
87
|
+
echo >&2 "==> Added GeneValidator to your PATH in ${DOT_FILE}"
|
|
88
|
+
echo >&2
|
|
89
|
+
echo >&2 "==> Run \`genevalidator -h\` in a new window to get started."
|
|
90
|
+
fi
|
|
91
|
+
|
|
92
|
+
echo >&2
|
data/lib/genevalidator.rb
CHANGED
|
@@ -5,61 +5,36 @@ require 'genevalidator/arg_validation'
|
|
|
5
5
|
require 'genevalidator/blast'
|
|
6
6
|
require 'genevalidator/exceptions'
|
|
7
7
|
require 'genevalidator/get_raw_sequences'
|
|
8
|
+
require 'genevalidator/json_to_gv_results'
|
|
8
9
|
require 'genevalidator/output'
|
|
10
|
+
require 'genevalidator/output_files'
|
|
9
11
|
require 'genevalidator/tabular_parser'
|
|
10
12
|
require 'genevalidator/validation'
|
|
11
13
|
|
|
12
14
|
# Top level module / namespace.
|
|
13
15
|
module GeneValidator
|
|
14
16
|
class << self
|
|
15
|
-
attr_accessor :opt, :config, :overview
|
|
17
|
+
attr_accessor :opt, :config, :overview, :dirs
|
|
16
18
|
attr_reader :raw_seq_file_index
|
|
17
19
|
attr_reader :raw_seq_file_load
|
|
18
20
|
# array of indexes for the start offsets of each query in the fasta file
|
|
19
21
|
attr_reader :query_idx
|
|
20
|
-
attr_accessor :mutex, :
|
|
22
|
+
attr_accessor :mutex, :mutex_array
|
|
21
23
|
|
|
22
|
-
def init(opt, start_idx = 1
|
|
23
|
-
|
|
24
|
+
def init(opt, start_idx = 1)
|
|
25
|
+
warn '==> Analysing input arguments'
|
|
24
26
|
@opt = opt
|
|
25
27
|
GVArgValidation.validate_args # validates @opt
|
|
28
|
+
number_of_sequences = index_the_input
|
|
26
29
|
|
|
27
|
-
@config =
|
|
28
|
-
|
|
29
|
-
start_idx: start_idx,
|
|
30
|
-
summary: summary,
|
|
31
|
-
|
|
32
|
-
type: BlastUtils.guess_sequence_type_from_input_file,
|
|
33
|
-
filename: File.basename(@opt[:input_fasta_file]),
|
|
34
|
-
html_path: "#{@opt[:input_fasta_file]}.html",
|
|
35
|
-
json_file: File.join(File.dirname(@opt[:input_fasta_file]),
|
|
36
|
-
"#{File.basename(@opt[:input_fasta_file])}.json"),
|
|
37
|
-
plot_dir: "#{@opt[:input_fasta_file]}.html/files/json",
|
|
38
|
-
aux: File.expand_path(File.join(File.dirname(__FILE__), '../aux')),
|
|
39
|
-
|
|
40
|
-
json_output: [],
|
|
41
|
-
run_no: 0,
|
|
42
|
-
output_max: 2500 # max no. of queries in the output file
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
@overview = {
|
|
46
|
-
no_queries: 0,
|
|
47
|
-
scores: [],
|
|
48
|
-
good_scores: 0,
|
|
49
|
-
bad_scores: 0,
|
|
50
|
-
nee: 0,
|
|
51
|
-
no_mafft: 0,
|
|
52
|
-
no_internet: 0,
|
|
53
|
-
map_errors: Hash.new(0),
|
|
54
|
-
run_time: Hash.new(Pair1.new(0, 0))
|
|
55
|
-
}
|
|
30
|
+
@config = setup_config(start_idx, number_of_sequences)
|
|
31
|
+
@dirs = setup_dirnames(@opt[:input_fasta_file])
|
|
56
32
|
|
|
57
33
|
@mutex = Mutex.new
|
|
58
34
|
@mutex_array = Mutex.new
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
index_the_input
|
|
35
|
+
|
|
36
|
+
resume_from_previous_run(opt[:resumable]) unless opt[:resumable].nil?
|
|
37
|
+
|
|
63
38
|
RawSequences.index_raw_seq_file if @opt[:raw_sequences]
|
|
64
39
|
end
|
|
65
40
|
|
|
@@ -69,6 +44,8 @@ module GeneValidator
|
|
|
69
44
|
# Run BLAST on all sequences (generates @opt[:blast_xml_file])
|
|
70
45
|
# if no BLAST OUTPUT file provided...
|
|
71
46
|
unless @opt[:blast_xml_file] || @opt[:blast_tabular_file]
|
|
47
|
+
blast_xml_fname = "#{dirs[:filename]}.blast_xml"
|
|
48
|
+
opt[:blast_xml_file] = File.join(dirs[:tmp_dir], blast_xml_fname)
|
|
72
49
|
BlastUtils.run_blast_on_input_file
|
|
73
50
|
end
|
|
74
51
|
# Obtain fasta file of all BLAST hits if running align or dup validations
|
|
@@ -78,19 +55,110 @@ module GeneValidator
|
|
|
78
55
|
end
|
|
79
56
|
# Run Validations
|
|
80
57
|
iterator = parse_blast_output_file
|
|
81
|
-
|
|
58
|
+
Validations.new.run_validations(iterator)
|
|
59
|
+
produce_output
|
|
60
|
+
print_directories_locations
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
##
|
|
64
|
+
# Params:
|
|
65
|
+
# +output+: filename or stream, according to the type
|
|
66
|
+
# +type+: file or stream
|
|
67
|
+
# Returns an iterator.
|
|
68
|
+
def parse_blast_output_file
|
|
69
|
+
if @opt[:blast_xml_file]
|
|
70
|
+
Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
|
|
71
|
+
else
|
|
72
|
+
TabularParser.new
|
|
73
|
+
end
|
|
74
|
+
## TODO: Add a Rescue statement - e.g. if unable to create the Object...
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Also called by json_to_gv script
|
|
78
|
+
def setup_dirnames(input_file)
|
|
79
|
+
fname = File.basename(input_file, File.extname(input_file))
|
|
80
|
+
out_dir = setup_output_dir(fname)
|
|
81
|
+
{ filename: fname,
|
|
82
|
+
output_dir: out_dir,
|
|
83
|
+
tmp_dir: File.join(out_dir, 'tmp'),
|
|
84
|
+
json_dir: File.join(out_dir, 'tmp/json'),
|
|
85
|
+
html_file: File.join(out_dir, "#{fname}_results*.html"),
|
|
86
|
+
json_file: File.join(out_dir, "#{fname}_results.json"),
|
|
87
|
+
csv_file: File.join(out_dir, "#{fname}_results.csv"),
|
|
88
|
+
summary_file: File.join(out_dir, "#{fname}_summary.csv"),
|
|
89
|
+
fasta_file: File.join(out_dir, "#{fname}_results.fa"),
|
|
90
|
+
aux_dir: File.expand_path('../aux', __dir__) }
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def extract_input_fasta_sequence(index)
|
|
94
|
+
start_offset = @query_idx[index + 1] - @query_idx[index]
|
|
95
|
+
end_offset = @query_idx[index]
|
|
96
|
+
IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def produce_output
|
|
100
|
+
@overview = Output.generate_overview(@config[:json_output],
|
|
101
|
+
@opt[:min_blast_hits])
|
|
102
|
+
eval_text = Output.generate_evaluation_text(@overview)
|
|
103
|
+
Output.print_console_footer(eval_text, @opt)
|
|
104
|
+
|
|
105
|
+
output_files = OutputFiles.new
|
|
106
|
+
output_files.write_json
|
|
107
|
+
output_files.write_html(eval_text)
|
|
108
|
+
output_files.write_csv
|
|
109
|
+
output_files.write_summary
|
|
110
|
+
output_files.print_best_fasta
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
private
|
|
114
|
+
|
|
115
|
+
def setup_config(start_idx, seq_length)
|
|
116
|
+
{
|
|
117
|
+
idx: 0,
|
|
118
|
+
start_idx: start_idx,
|
|
119
|
+
|
|
120
|
+
type: BlastUtils.guess_sequence_type_from_input_file,
|
|
82
121
|
|
|
83
|
-
|
|
84
|
-
|
|
122
|
+
json_output: Array.new(seq_length),
|
|
123
|
+
run_no: 0,
|
|
124
|
+
output_max: 2500 # max no. of queries in the output html file
|
|
125
|
+
}
|
|
85
126
|
end
|
|
86
127
|
|
|
87
128
|
##
|
|
88
129
|
# Creates the output folder and copies the auxiliary folders to this folder
|
|
89
|
-
def
|
|
90
|
-
|
|
130
|
+
def setup_output_dir(fname)
|
|
131
|
+
dir_name = "#{fname}_" + Time.now.strftime('%Y_%m_%d_%H_%M_%S')
|
|
132
|
+
default_outdir = File.join(Dir.pwd, dir_name)
|
|
133
|
+
output_dir = @opt[:output_dir].nil? ? default_outdir : @opt[:output_dir]
|
|
134
|
+
assert_output_dir_does_not_exist(output_dir)
|
|
91
135
|
Dir.mkdir(output_dir)
|
|
92
|
-
|
|
93
|
-
|
|
136
|
+
Dir.mkdir(File.join(output_dir, 'tmp'))
|
|
137
|
+
cp_html_files(output_dir)
|
|
138
|
+
output_dir
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def assert_output_dir_does_not_exist(output_dir)
|
|
142
|
+
return unless Dir.exist?(output_dir)
|
|
143
|
+
FileUtils.rm_r(output_dir) if @opt[:force_rewrite]
|
|
144
|
+
return if @opt[:force_rewrite]
|
|
145
|
+
warn "The output directory (#{output_dir}) already exists."
|
|
146
|
+
warn ''
|
|
147
|
+
warn 'Please remove this directory before continuing.'
|
|
148
|
+
warn 'Alternatively, you rerun GeneValidator with the `--force` argument,'
|
|
149
|
+
warn 'which rewrites over any previous output.'
|
|
150
|
+
exit 1
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def cp_html_files(output_dir)
|
|
154
|
+
if @opt[:output_formats].include? 'html'
|
|
155
|
+
aux_files = File.expand_path('../aux/html_files/', __dir__)
|
|
156
|
+
FileUtils.cp_r(aux_files, output_dir)
|
|
157
|
+
FileUtils.ln_s(File.join('..', 'html_files', 'json'),
|
|
158
|
+
File.join(output_dir, 'tmp', 'json'))
|
|
159
|
+
else
|
|
160
|
+
Dir.mkdir(File.join(output_dir, 'tmp', 'json'))
|
|
161
|
+
end
|
|
94
162
|
end
|
|
95
163
|
|
|
96
164
|
##
|
|
@@ -99,22 +167,69 @@ module GeneValidator
|
|
|
99
167
|
# start and end positions of each query.
|
|
100
168
|
def index_the_input
|
|
101
169
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
|
102
|
-
@query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map
|
|
170
|
+
@query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map do
|
|
171
|
+
Regexp.last_match.begin(0)
|
|
172
|
+
end
|
|
103
173
|
@query_idx.push(fasta_content.length)
|
|
174
|
+
@query_idx.length - 1
|
|
104
175
|
end
|
|
105
176
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
#
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
177
|
+
def print_directories_locations
|
|
178
|
+
warn '==> GeneValidator output files have been saved to:'
|
|
179
|
+
warn " #{File.expand_path(@dirs[:output_dir])}"
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def resume_from_previous_run(prev_dir)
|
|
183
|
+
prev_tmp_dir = File.join(prev_dir, 'tmp')
|
|
184
|
+
return unless Dir.exist? prev_tmp_dir
|
|
185
|
+
copy_blast_xml_files(prev_tmp_dir)
|
|
186
|
+
copy_raw_seq_files(prev_tmp_dir)
|
|
187
|
+
copy_prev_json_output(prev_tmp_dir)
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
def copy_blast_xml_files(prev_tmp_dir)
|
|
191
|
+
return if @opt[:blast_xml_file] || @opt[:blast_tabular_file]
|
|
192
|
+
prev_blast_xml = Dir[File.join(prev_tmp_dir, '*blast_xml')]
|
|
193
|
+
return if prev_blast_xml.empty?
|
|
194
|
+
blast_xml_fname = "#{@dirs[:filename]}.blast_xml"
|
|
195
|
+
@opt[:blast_xml_file] = File.join(@dirs[:tmp_dir], blast_xml_fname)
|
|
196
|
+
FileUtils.cp(prev_blast_xml[0], @opt[:blast_xml_file])
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def copy_raw_seq_files(prev_tmp_dir)
|
|
200
|
+
return if @opt[:raw_sequences]
|
|
201
|
+
return unless @opt[:validations].include?('align') ||
|
|
202
|
+
@opt[:validations].include?('dup')
|
|
203
|
+
prev_raw_seq = Dir[File.join(prev_tmp_dir, '*raw_seq')]
|
|
204
|
+
return if prev_raw_seq.empty?
|
|
205
|
+
raw_seq_fname = "#{@dirs[:filename]}.blast_xml.raw_seq"
|
|
206
|
+
@opt[:raw_sequences] = File.join(@dirs[:tmp_dir], raw_seq_fname)
|
|
207
|
+
FileUtils.cp(prev_raw_seq[0], @opt[:raw_sequences])
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
def copy_prev_json_output(prev_tmp_dir)
|
|
211
|
+
prev_json_dir = File.join(prev_tmp_dir, 'json')
|
|
212
|
+
return unless Dir.exist? prev_json_dir
|
|
213
|
+
all_jsons = Dir[File.join(prev_json_dir, '*.json')]
|
|
214
|
+
FileUtils.cp(all_jsons, @dirs[:json_dir])
|
|
215
|
+
overview_json = Dir[File.join(prev_json_dir, 'overview.json')]
|
|
216
|
+
data_jsons = all_jsons - overview_json
|
|
217
|
+
parse_prev_json(data_jsons)
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
def parse_prev_json(data_jsons)
|
|
221
|
+
data_jsons.each do |json|
|
|
222
|
+
json_contents = File.read(File.expand_path(json))
|
|
223
|
+
data = JSON.parse(json_contents, symbolize_names: true)
|
|
224
|
+
idx = json.match(/(\d+).json/)[1].to_i - 1
|
|
225
|
+
@config[:json_output][idx] = data
|
|
226
|
+
print_prev_json_to_console(data)
|
|
116
227
|
end
|
|
117
|
-
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
def print_prev_json_to_console(data)
|
|
231
|
+
JsonToGVResults.print_console_header(data)
|
|
232
|
+
JsonToGVResults.print_output_console(data)
|
|
118
233
|
end
|
|
119
234
|
end
|
|
120
235
|
end
|