genevalidator 1.6.12 → 2.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
@@ -0,0 +1,60 @@
|
|
1
|
+
# Running GeneValidator with sample data
|
2
|
+
|
3
|
+
Here, we walk through the steps involved in analysing some sample data with GeneValidator. There are two options on how to run genevalidator - the second option is faster with larger input files.
|
4
|
+
|
5
|
+
## Expected Results
|
6
|
+
|
7
|
+
<strong>protein_data.fa</strong> [See here](http://wurmlab.github.io/tools/genevalidator/examplar_data/protein_input/)
|
8
|
+
<strong>mrna_data.fa</strong> [See here](http://wurmlab.github.io/tools/genevalidator/examplar_data/genetic_input/)
|
9
|
+
|
10
|
+
##### Running GeneValidator with the included SwissProt Database, with four threads
|
11
|
+
|
12
|
+
```bash
|
13
|
+
# Protein data
|
14
|
+
$ genevalidator -n 4 protein_data.fa
|
15
|
+
|
16
|
+
# MRNA data
|
17
|
+
$ genevalidator -n 4 mrna_data.fa
|
18
|
+
```
|
19
|
+
|
20
|
+
This will produce a folder that will contain your result files.
|
21
|
+
|
22
|
+
##### Running GeneValidator with a pre-computed BLAST XML file
|
23
|
+
|
24
|
+
For protein_data.fa:
|
25
|
+
|
26
|
+
```
|
27
|
+
blastp -db DATABASE_PATH -num_threads 4 -out protein_data.blast.xml -query protein_data.fa -outfmt 5
|
28
|
+
|
29
|
+
# Run GeneValidator
|
30
|
+
genevalidator -d DATABASE_PATH -n 4 -x protein_data.blast.xml protein_data.fa
|
31
|
+
```
|
32
|
+
|
33
|
+
For mrna_data.fa:
|
34
|
+
|
35
|
+
```
|
36
|
+
blastx -db DATABASE_PATH -num_threads 4 -out mrna_data.blast.xml -query mrna_data.fa -outfmt 5
|
37
|
+
|
38
|
+
# Run GeneValidator
|
39
|
+
genevalidator -d DATABASE_PATH -n 4 -x mrna_data.blast.xml mrna_data.fa
|
40
|
+
```
|
41
|
+
|
42
|
+
##### Running GeneValidator with a pre-computed BLAST tabular file
|
43
|
+
|
44
|
+
For protein_data.fa:
|
45
|
+
|
46
|
+
```
|
47
|
+
blastp -db DATABASE_PATH -num_threads 4 -out protein_data.blast.tsv -query protein_data.fa -outfmt '7 qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq'
|
48
|
+
|
49
|
+
# Run GeneValidator
|
50
|
+
genevalidator -d DATABASE_PATH -n 4 -t protein_data.blast.tsv --blast_tabular_options 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' protein_data.fa
|
51
|
+
```
|
52
|
+
|
53
|
+
For mrna_data.fa:
|
54
|
+
|
55
|
+
```
|
56
|
+
blastx -db DATABASE_PATH -num_threads 4 -out mrna_data.blast.tsv -query mrna_data.fa -outfmt '7 qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq'
|
57
|
+
|
58
|
+
# Run GeneValidator
|
59
|
+
genevalidator -d DATABASE_PATH -n 4 -t mrna_data.blast.tsv --blast_tabular_options 'qseqid sseqid sacc slen qstart qend sstart send length qframe pident nident evalue qseq sseq' mrna_data.fa
|
60
|
+
```
|
@@ -235,4 +235,4 @@ CCATGCCGGAGCATCAGTAGATCTTGCCATCTTCTCCCTTCATCTGGCAGGTGTCTCCTC
|
|
235
235
|
CATCCTCGGAGCAATTAACTTTATCACCACAGCCATCAACATGAAACCACCTGCCCTCTC
|
236
236
|
ACAATACCAAACCCCCCTATTCGTTTGATCCGTCTTAATTACCGCCATCCTTCTTCTCCT
|
237
237
|
TTCCCTCCCAGTTCTCGCCGCTGGTATTACAATGCTTCTAACAGATCGAAATCTAAACAC
|
238
|
-
TACATTCTTCGACCCTGCAGGGGGCGGAGACCCAATTTTATACCAACACTTA
|
238
|
+
TACATTCTTCGACCCTGCAGGGGGCGGAGACCCAATTTTATACCAACACTTA
|
File without changes
|
data/genevalidator.gemspec
CHANGED
@@ -1,12 +1,10 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require 'genevalidator/version'
|
5
4
|
|
6
5
|
Gem::Specification.new do |s|
|
7
|
-
# meta
|
8
6
|
s.name = 'genevalidator'
|
9
|
-
s.version =
|
7
|
+
s.version = GeneValidator::VERSION
|
10
8
|
s.authors = ['Monica Dragan', 'Ismail Moghul', 'Anurag Priyam',
|
11
9
|
'Yannick Wurm']
|
12
10
|
s.email = 'y.wurm@qmul.ac.uk'
|
@@ -14,19 +12,27 @@ Gem::Specification.new do |s|
|
|
14
12
|
s.license = 'AGPL'
|
15
13
|
s.summary = 'Identifying problems with gene predictions.'
|
16
14
|
s.description = 'The tool validates the input predicted genes and provides' \
|
17
|
-
' useful information (length validation, gene merge'\
|
15
|
+
' useful information (length validation, gene merge' \
|
18
16
|
' validation, sequence duplication checking, ORF finding)' \
|
19
17
|
' based on the similarities to genes in public databases.'
|
18
|
+
s.required_ruby_version = '>= 2.2.0'
|
20
19
|
|
21
|
-
s.
|
22
|
-
s.add_development_dependency '
|
23
|
-
s.add_development_dependency '
|
24
|
-
|
25
|
-
s.
|
26
|
-
s.
|
27
|
-
s.add_dependency
|
28
|
-
s.add_dependency
|
29
|
-
s.add_dependency
|
20
|
+
s.add_development_dependency 'minitest', '~> 5.10'
|
21
|
+
s.add_development_dependency 'rake', '~> 12.3'
|
22
|
+
s.add_development_dependency 'yard', '~> 0.9.11'
|
23
|
+
|
24
|
+
s.add_dependency 'bio', '~> 1.4'
|
25
|
+
s.add_dependency 'bio-blastxmlparser', '~> 2.0'
|
26
|
+
s.add_dependency 'genevalidatorapp', '~> 2.1.3'
|
27
|
+
s.add_dependency 'rack', '~> 2.0'
|
28
|
+
s.add_dependency 'slim', '~>3.0'
|
29
|
+
s.add_dependency 'statsample', '2.1.0'
|
30
|
+
|
31
|
+
# Adding mechanize gem just to silence a message on load.
|
32
|
+
# This is due to the Statsample gem
|
33
|
+
# See https://github.com/SciRuby/daru/issues/404
|
34
|
+
# See https://github.com/SciRuby/statsample/pull/69
|
35
|
+
s.add_dependency 'mechanize', '2.7.5'
|
30
36
|
|
31
37
|
s.files = `git ls-files -z`.split("\x0")
|
32
38
|
s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
@@ -35,15 +41,24 @@ Gem::Specification.new do |s|
|
|
35
41
|
|
36
42
|
s.post_install_message = <<INFO
|
37
43
|
|
38
|
-
|
39
|
-
|
44
|
+
----------------------------------------------------------------------------
|
45
|
+
Thank you for validating your gene predictions with GeneValidator!
|
46
|
+
|
47
|
+
==> To launch GeneValidator execute 'genevalidator' from command line.
|
48
|
+
|
49
|
+
genevalidator [OPTIONAL ARGUMENTS] INPUT_FILE
|
50
|
+
|
51
|
+
See 'genevalidator --help' for more information
|
52
|
+
|
53
|
+
==> To launch GeneValidator as a web application execute 'genevalidator' from command line.
|
54
|
+
|
55
|
+
genevalidator app [OPTIONAL ARGUMENTS]
|
40
56
|
|
41
|
-
|
57
|
+
See 'genevalidator app --help' for more information
|
42
58
|
|
43
|
-
|
59
|
+
==> Visit https://wurmlab.github.io/tools/genevalidator/ for more information.
|
44
60
|
|
45
|
-
|
46
|
-
------------------------------------------------------------------------
|
61
|
+
----------------------------------------------------------------------------
|
47
62
|
|
48
63
|
INFO
|
49
64
|
end
|
data/install.sh
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
|
3
|
+
## USAGE: bash install.sh $INSTALL_DIR
|
4
|
+
## $ bash install.sh $INSTALL_DIR
|
5
|
+
|
6
|
+
set -eu
|
7
|
+
|
8
|
+
# OS detection
|
9
|
+
KERNEL="$(uname -s | tr '[:upper:]' '[:lower:]')"
|
10
|
+
|
11
|
+
if [ "$KERNEL" = "darwin" ]; then
|
12
|
+
PLATFORM='osx'
|
13
|
+
elif [ "$KERNEL" = "linux" ]; then
|
14
|
+
ARCH=$(uname -m)
|
15
|
+
if [ "$ARCH" = "x86_64" ]; then
|
16
|
+
PLATFORM='linux-x86_64'
|
17
|
+
else
|
18
|
+
PLATFORM='linux-x86'
|
19
|
+
fi
|
20
|
+
fi
|
21
|
+
|
22
|
+
# If there is an argument then that is where GV will be installed
|
23
|
+
if [ "$0" = 'sh' ]; then
|
24
|
+
# I.e. when piping from curl
|
25
|
+
INSTALL_DIR=$PWD/genevalidator
|
26
|
+
elif [ "$0" = 'install.sh' ]; then
|
27
|
+
# I.e. when running directly
|
28
|
+
INSTALL_DIR=$PWD/genevalidator
|
29
|
+
else
|
30
|
+
INSTALL_DIR="$0"
|
31
|
+
fi
|
32
|
+
|
33
|
+
GV_URL=$(curl -s https://api.github.com/repos/wurmlab/genevalidator/releases/latest \
|
34
|
+
| grep browser_download_url \
|
35
|
+
| grep -i $PLATFORM \
|
36
|
+
| cut -d '"' -f 4)
|
37
|
+
|
38
|
+
echo >&2 "==> Installing GeneValidator to:"
|
39
|
+
echo >&2 " ${INSTALL_DIR}"
|
40
|
+
echo >&2
|
41
|
+
|
42
|
+
mkdir "${INSTALL_DIR}"
|
43
|
+
curl -SL "$GV_URL" | tar zxf - -C "${INSTALL_DIR}" --strip-components 1
|
44
|
+
|
45
|
+
echo >&2
|
46
|
+
echo >&2 "==> GeneValidator successfully installed."
|
47
|
+
|
48
|
+
### Check which SHELL and then test different profile files
|
49
|
+
case $SHELL in
|
50
|
+
*/zsh)
|
51
|
+
# assume Zsh
|
52
|
+
if test -e "${HOME}/.zshrc"; then
|
53
|
+
DOT_FILE=${HOME}/.zshrc
|
54
|
+
elif test -e "${HOME}/.zprofile"; then
|
55
|
+
DOT_FILE=${HOME}/.zprofile
|
56
|
+
elif test -e "${HOME}/.profile"; then
|
57
|
+
DOT_FILE=${HOME}/.profile
|
58
|
+
fi
|
59
|
+
;;
|
60
|
+
*/bash)
|
61
|
+
# assume Bash
|
62
|
+
if test -e "${HOME}/.bashrc"; then
|
63
|
+
DOT_FILE=${HOME}/.bashrc
|
64
|
+
elif test -e "${HOME}/.bash_profile"; then
|
65
|
+
DOT_FILE=${HOME}/.bash_profile
|
66
|
+
elif test -e "${HOME}/.profile"; then
|
67
|
+
DOT_FILE=${HOME}/.profile
|
68
|
+
fi
|
69
|
+
;;
|
70
|
+
*)
|
71
|
+
if test -e "${HOME}/.profile"; then
|
72
|
+
DOT_FILE=${HOME}/.profile
|
73
|
+
fi
|
74
|
+
esac
|
75
|
+
|
76
|
+
|
77
|
+
if [ -z ${DOT_FILE+x} ]; then
|
78
|
+
# DOT File hasn't been set.
|
79
|
+
echo >&2
|
80
|
+
echo >&2 '==> No profile files were found.'
|
81
|
+
echo >&2 ' Please create one and add the following line to that file:'
|
82
|
+
echo >&2
|
83
|
+
echo >&2 ' export PATH="'"${INSTALL_DIR}"'/bin:${PATH}"'
|
84
|
+
else
|
85
|
+
echo >&2 'export PATH="'"${INSTALL_DIR}"'/bin:${PATH}"' >> "${DOT_FILE}"
|
86
|
+
echo >&2
|
87
|
+
echo >&2 "==> Added GeneValidator to your PATH in ${DOT_FILE}"
|
88
|
+
echo >&2
|
89
|
+
echo >&2 "==> Run \`genevalidator -h\` in a new window to get started."
|
90
|
+
fi
|
91
|
+
|
92
|
+
echo >&2
|
data/lib/genevalidator.rb
CHANGED
@@ -5,61 +5,36 @@ require 'genevalidator/arg_validation'
|
|
5
5
|
require 'genevalidator/blast'
|
6
6
|
require 'genevalidator/exceptions'
|
7
7
|
require 'genevalidator/get_raw_sequences'
|
8
|
+
require 'genevalidator/json_to_gv_results'
|
8
9
|
require 'genevalidator/output'
|
10
|
+
require 'genevalidator/output_files'
|
9
11
|
require 'genevalidator/tabular_parser'
|
10
12
|
require 'genevalidator/validation'
|
11
13
|
|
12
14
|
# Top level module / namespace.
|
13
15
|
module GeneValidator
|
14
16
|
class << self
|
15
|
-
attr_accessor :opt, :config, :overview
|
17
|
+
attr_accessor :opt, :config, :overview, :dirs
|
16
18
|
attr_reader :raw_seq_file_index
|
17
19
|
attr_reader :raw_seq_file_load
|
18
20
|
# array of indexes for the start offsets of each query in the fasta file
|
19
21
|
attr_reader :query_idx
|
20
|
-
attr_accessor :mutex, :
|
22
|
+
attr_accessor :mutex, :mutex_array
|
21
23
|
|
22
|
-
def init(opt, start_idx = 1
|
23
|
-
|
24
|
+
def init(opt, start_idx = 1)
|
25
|
+
warn '==> Analysing input arguments'
|
24
26
|
@opt = opt
|
25
27
|
GVArgValidation.validate_args # validates @opt
|
28
|
+
number_of_sequences = index_the_input
|
26
29
|
|
27
|
-
@config =
|
28
|
-
|
29
|
-
start_idx: start_idx,
|
30
|
-
summary: summary,
|
31
|
-
|
32
|
-
type: BlastUtils.guess_sequence_type_from_input_file,
|
33
|
-
filename: File.basename(@opt[:input_fasta_file]),
|
34
|
-
html_path: "#{@opt[:input_fasta_file]}.html",
|
35
|
-
json_file: File.join(File.dirname(@opt[:input_fasta_file]),
|
36
|
-
"#{File.basename(@opt[:input_fasta_file])}.json"),
|
37
|
-
plot_dir: "#{@opt[:input_fasta_file]}.html/files/json",
|
38
|
-
aux: File.expand_path(File.join(File.dirname(__FILE__), '../aux')),
|
39
|
-
|
40
|
-
json_output: [],
|
41
|
-
run_no: 0,
|
42
|
-
output_max: 2500 # max no. of queries in the output file
|
43
|
-
}
|
44
|
-
|
45
|
-
@overview = {
|
46
|
-
no_queries: 0,
|
47
|
-
scores: [],
|
48
|
-
good_scores: 0,
|
49
|
-
bad_scores: 0,
|
50
|
-
nee: 0,
|
51
|
-
no_mafft: 0,
|
52
|
-
no_internet: 0,
|
53
|
-
map_errors: Hash.new(0),
|
54
|
-
run_time: Hash.new(Pair1.new(0, 0))
|
55
|
-
}
|
30
|
+
@config = setup_config(start_idx, number_of_sequences)
|
31
|
+
@dirs = setup_dirnames(@opt[:input_fasta_file])
|
56
32
|
|
57
33
|
@mutex = Mutex.new
|
58
34
|
@mutex_array = Mutex.new
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
index_the_input
|
35
|
+
|
36
|
+
resume_from_previous_run(opt[:resumable]) unless opt[:resumable].nil?
|
37
|
+
|
63
38
|
RawSequences.index_raw_seq_file if @opt[:raw_sequences]
|
64
39
|
end
|
65
40
|
|
@@ -69,6 +44,8 @@ module GeneValidator
|
|
69
44
|
# Run BLAST on all sequences (generates @opt[:blast_xml_file])
|
70
45
|
# if no BLAST OUTPUT file provided...
|
71
46
|
unless @opt[:blast_xml_file] || @opt[:blast_tabular_file]
|
47
|
+
blast_xml_fname = "#{dirs[:filename]}.blast_xml"
|
48
|
+
opt[:blast_xml_file] = File.join(dirs[:tmp_dir], blast_xml_fname)
|
72
49
|
BlastUtils.run_blast_on_input_file
|
73
50
|
end
|
74
51
|
# Obtain fasta file of all BLAST hits if running align or dup validations
|
@@ -78,19 +55,110 @@ module GeneValidator
|
|
78
55
|
end
|
79
56
|
# Run Validations
|
80
57
|
iterator = parse_blast_output_file
|
81
|
-
|
58
|
+
Validations.new.run_validations(iterator)
|
59
|
+
produce_output
|
60
|
+
print_directories_locations
|
61
|
+
end
|
62
|
+
|
63
|
+
##
|
64
|
+
# Params:
|
65
|
+
# +output+: filename or stream, according to the type
|
66
|
+
# +type+: file or stream
|
67
|
+
# Returns an iterator..
|
68
|
+
def parse_blast_output_file
|
69
|
+
if @opt[:blast_xml_file]
|
70
|
+
Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
|
71
|
+
else
|
72
|
+
TabularParser.new
|
73
|
+
end
|
74
|
+
## TODO: Add a Rescue statement - e.g. if unable to create the Object...
|
75
|
+
end
|
76
|
+
|
77
|
+
# Also called by json_to_gv script
|
78
|
+
def setup_dirnames(input_file)
|
79
|
+
fname = File.basename(input_file, File.extname(input_file))
|
80
|
+
out_dir = setup_output_dir(fname)
|
81
|
+
{ filename: fname,
|
82
|
+
output_dir: out_dir,
|
83
|
+
tmp_dir: File.join(out_dir, 'tmp'),
|
84
|
+
json_dir: File.join(out_dir, 'tmp/json'),
|
85
|
+
html_file: File.join(out_dir, "#{fname}_results*.html"),
|
86
|
+
json_file: File.join(out_dir, "#{fname}_results.json"),
|
87
|
+
csv_file: File.join(out_dir, "#{fname}_results.csv"),
|
88
|
+
summary_file: File.join(out_dir, "#{fname}_summary.csv"),
|
89
|
+
fasta_file: File.join(out_dir, "#{fname}_results.fa"),
|
90
|
+
aux_dir: File.expand_path('../aux', __dir__) }
|
91
|
+
end
|
92
|
+
|
93
|
+
def extract_input_fasta_sequence(index)
|
94
|
+
start_offset = @query_idx[index + 1] - @query_idx[index]
|
95
|
+
end_offset = @query_idx[index]
|
96
|
+
IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
|
97
|
+
end
|
98
|
+
|
99
|
+
def produce_output
|
100
|
+
@overview = Output.generate_overview(@config[:json_output],
|
101
|
+
@opt[:min_blast_hits])
|
102
|
+
eval_text = Output.generate_evaluation_text(@overview)
|
103
|
+
Output.print_console_footer(eval_text, @opt)
|
104
|
+
|
105
|
+
output_files = OutputFiles.new
|
106
|
+
output_files.write_json
|
107
|
+
output_files.write_html(eval_text)
|
108
|
+
output_files.write_csv
|
109
|
+
output_files.write_summary
|
110
|
+
output_files.print_best_fasta
|
111
|
+
end
|
112
|
+
|
113
|
+
private
|
114
|
+
|
115
|
+
def setup_config(start_idx, seq_length)
|
116
|
+
{
|
117
|
+
idx: 0,
|
118
|
+
start_idx: start_idx,
|
119
|
+
|
120
|
+
type: BlastUtils.guess_sequence_type_from_input_file,
|
82
121
|
|
83
|
-
|
84
|
-
|
122
|
+
json_output: Array.new(seq_length),
|
123
|
+
run_no: 0,
|
124
|
+
output_max: 2500 # max no. of queries in the output html file
|
125
|
+
}
|
85
126
|
end
|
86
127
|
|
87
128
|
##
|
88
129
|
# Creates the output folder and copies the auxiliary folders to this folder
|
89
|
-
def
|
90
|
-
|
130
|
+
def setup_output_dir(fname)
|
131
|
+
dir_name = "#{fname}_" + Time.now.strftime('%Y_%m_%d_%H_%M_%S')
|
132
|
+
default_outdir = File.join(Dir.pwd, dir_name)
|
133
|
+
output_dir = @opt[:output_dir].nil? ? default_outdir : @opt[:output_dir]
|
134
|
+
assert_output_dir_does_not_exist(output_dir)
|
91
135
|
Dir.mkdir(output_dir)
|
92
|
-
|
93
|
-
|
136
|
+
Dir.mkdir(File.join(output_dir, 'tmp'))
|
137
|
+
cp_html_files(output_dir)
|
138
|
+
output_dir
|
139
|
+
end
|
140
|
+
|
141
|
+
def assert_output_dir_does_not_exist(output_dir)
|
142
|
+
return unless Dir.exist?(output_dir)
|
143
|
+
FileUtils.rm_r(output_dir) if @opt[:force_rewrite]
|
144
|
+
return if @opt[:force_rewrite]
|
145
|
+
warn "The output directory (#{output_dir}) already exists."
|
146
|
+
warn ''
|
147
|
+
warn 'Please remove this directory before continuing.'
|
148
|
+
warn 'Alternatively, you rerun GeneValidator with the `--force` argument,'
|
149
|
+
warn 'which rewrites over any previous output.'
|
150
|
+
exit 1
|
151
|
+
end
|
152
|
+
|
153
|
+
def cp_html_files(output_dir)
|
154
|
+
if @opt[:output_formats].include? 'html'
|
155
|
+
aux_files = File.expand_path('../aux/html_files/', __dir__)
|
156
|
+
FileUtils.cp_r(aux_files, output_dir)
|
157
|
+
FileUtils.ln_s(File.join('..', 'html_files', 'json'),
|
158
|
+
File.join(output_dir, 'tmp', 'json'))
|
159
|
+
else
|
160
|
+
Dir.mkdir(File.join(output_dir, 'tmp', 'json'))
|
161
|
+
end
|
94
162
|
end
|
95
163
|
|
96
164
|
##
|
@@ -99,22 +167,69 @@ module GeneValidator
|
|
99
167
|
# start and end positions of each query.
|
100
168
|
def index_the_input
|
101
169
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
102
|
-
@query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map
|
170
|
+
@query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map do
|
171
|
+
Regexp.last_match.begin(0)
|
172
|
+
end
|
103
173
|
@query_idx.push(fasta_content.length)
|
174
|
+
@query_idx.length - 1
|
104
175
|
end
|
105
176
|
|
106
|
-
|
107
|
-
|
108
|
-
#
|
109
|
-
|
110
|
-
|
111
|
-
def
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
177
|
+
def print_directories_locations
|
178
|
+
warn '==> GeneValidator output files have been saved to:'
|
179
|
+
warn " #{File.expand_path(@dirs[:output_dir])}"
|
180
|
+
end
|
181
|
+
|
182
|
+
def resume_from_previous_run(prev_dir)
|
183
|
+
prev_tmp_dir = File.join(prev_dir, 'tmp')
|
184
|
+
return unless Dir.exist? prev_tmp_dir
|
185
|
+
copy_blast_xml_files(prev_tmp_dir)
|
186
|
+
copy_raw_seq_files(prev_tmp_dir)
|
187
|
+
copy_prev_json_output(prev_tmp_dir)
|
188
|
+
end
|
189
|
+
|
190
|
+
def copy_blast_xml_files(prev_tmp_dir)
|
191
|
+
return if @opt[:blast_xml_file] || @opt[:blast_tabular_file]
|
192
|
+
prev_blast_xml = Dir[File.join(prev_tmp_dir, '*blast_xml')]
|
193
|
+
return if prev_blast_xml.empty?
|
194
|
+
blast_xml_fname = "#{@dirs[:filename]}.blast_xml"
|
195
|
+
@opt[:blast_xml_file] = File.join(@dirs[:tmp_dir], blast_xml_fname)
|
196
|
+
FileUtils.cp(prev_blast_xml[0], @opt[:blast_xml_file])
|
197
|
+
end
|
198
|
+
|
199
|
+
def copy_raw_seq_files(prev_tmp_dir)
|
200
|
+
return if @opt[:raw_sequences]
|
201
|
+
return unless @opt[:validations].include?('align') ||
|
202
|
+
@opt[:validations].include?('dup')
|
203
|
+
prev_raw_seq = Dir[File.join(prev_tmp_dir, '*raw_seq')]
|
204
|
+
return if prev_raw_seq.empty?
|
205
|
+
raw_seq_fname = "#{@dirs[:filename]}.blast_xml.raw_seq"
|
206
|
+
@opt[:raw_sequences] = File.join(@dirs[:tmp_dir], raw_seq_fname)
|
207
|
+
FileUtils.cp(prev_raw_seq[0], @opt[:raw_sequences])
|
208
|
+
end
|
209
|
+
|
210
|
+
def copy_prev_json_output(prev_tmp_dir)
|
211
|
+
prev_json_dir = File.join(prev_tmp_dir, 'json')
|
212
|
+
return unless Dir.exist? prev_json_dir
|
213
|
+
all_jsons = Dir[File.join(prev_json_dir, '*.json')]
|
214
|
+
FileUtils.cp(all_jsons, @dirs[:json_dir])
|
215
|
+
overview_json = Dir[File.join(prev_json_dir, 'overview.json')]
|
216
|
+
data_jsons = all_jsons - overview_json
|
217
|
+
parse_prev_json(data_jsons)
|
218
|
+
end
|
219
|
+
|
220
|
+
def parse_prev_json(data_jsons)
|
221
|
+
data_jsons.each do |json|
|
222
|
+
json_contents = File.read(File.expand_path(json))
|
223
|
+
data = JSON.parse(json_contents, symbolize_names: true)
|
224
|
+
idx = json.match(/(\d+).json/)[1].to_i - 1
|
225
|
+
@config[:json_output][idx] = data
|
226
|
+
print_prev_json_to_console(data)
|
116
227
|
end
|
117
|
-
|
228
|
+
end
|
229
|
+
|
230
|
+
def print_prev_json_to_console(data)
|
231
|
+
JsonToGVResults.print_console_header(data)
|
232
|
+
JsonToGVResults.print_output_console(data)
|
118
233
|
end
|
119
234
|
end
|
120
235
|
end
|