genevalidator 1.6.12 → 2.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
@@ -1,11 +1,11 @@
|
|
1
1
|
module GeneValidator
|
2
2
|
# Exception raised when BLAST path is not added to the CLASSPATH
|
3
|
-
class ClasspathError <
|
3
|
+
class ClasspathError < RuntimeError
|
4
4
|
end
|
5
5
|
|
6
6
|
# Exception raised when the command line type argument
|
7
7
|
# does not correspond to the type of the sequences in the fasta file
|
8
|
-
class SequenceTypeError <
|
8
|
+
class SequenceTypeError < RuntimeError
|
9
9
|
def to_s
|
10
10
|
"\nSequence Type error: Possible cause include that the blast output" \
|
11
11
|
" was not obtained against a protein database.\n"
|
@@ -13,15 +13,15 @@ module GeneValidator
|
|
13
13
|
end
|
14
14
|
|
15
15
|
# Exception raised when a nonexistent file is accessed
|
16
|
-
class FileNotFoundException <
|
16
|
+
class FileNotFoundException < RuntimeError
|
17
17
|
end
|
18
18
|
|
19
19
|
# Exception raised when blast does not find any hit
|
20
|
-
class QueryError <
|
20
|
+
class QueryError < RuntimeError
|
21
21
|
end
|
22
22
|
|
23
23
|
# Exception raised when a validation class is not an instance of ValidationTest
|
24
|
-
class ValidationClassError <
|
24
|
+
class ValidationClassError < RuntimeError
|
25
25
|
def to_s
|
26
26
|
"\nClass Type error: Possible cause include that one of the validations" \
|
27
27
|
" is not a sub-class of ValidationTest\n"
|
@@ -30,7 +30,7 @@ module GeneValidator
|
|
30
30
|
|
31
31
|
# Exception raised when a validation report class is not an instance of
|
32
32
|
# ValidationReport
|
33
|
-
class ReportClassError <
|
33
|
+
class ReportClassError < RuntimeError
|
34
34
|
def to_s
|
35
35
|
"\nClass Type error: Possible causes include that the type of one of" \
|
36
36
|
' the validation reports is not a subclass of the ValidationReport' \
|
@@ -40,20 +40,20 @@ module GeneValidator
|
|
40
40
|
|
41
41
|
# Exception raised when there are not enough blast hits to make a statistical
|
42
42
|
# validation
|
43
|
-
class NotEnoughHitsError <
|
43
|
+
class NotEnoughHitsError < RuntimeError
|
44
44
|
end
|
45
45
|
|
46
46
|
# Exception raised when a function depending on the internet connection raises an
|
47
47
|
# Exception
|
48
|
-
class NoInternetError <
|
48
|
+
class NoInternetError < RuntimeError
|
49
49
|
end
|
50
50
|
|
51
51
|
# Exception raised when the alignment initialization raises exception
|
52
|
-
class NoMafftInstallationError <
|
52
|
+
class NoMafftInstallationError < RuntimeError
|
53
53
|
end
|
54
54
|
|
55
55
|
# Exception raised when the -v argument didn't filter any validation test
|
56
|
-
class NoValidationError <
|
56
|
+
class NoValidationError < RuntimeError
|
57
57
|
def to_s
|
58
58
|
"\nValidation error: Possible cause inlcude that the -v arguments" \
|
59
59
|
" supplied is not valid\n"
|
@@ -61,7 +61,7 @@ module GeneValidator
|
|
61
61
|
end
|
62
62
|
|
63
63
|
# Exception raised when there are alias duplications
|
64
|
-
class AliasDuplicationError <
|
64
|
+
class AliasDuplicationError < RuntimeError
|
65
65
|
def to_s
|
66
66
|
"\nAlias Duplication error: Possible cause: At least two validations" \
|
67
67
|
" have the same CLI alias\n"
|
@@ -69,27 +69,27 @@ module GeneValidator
|
|
69
69
|
end
|
70
70
|
|
71
71
|
# Exception raised when the BLAST database is not set up with the '-parse_seqids' arg.
|
72
|
-
class BLASTDBError <
|
72
|
+
class BLASTDBError < RuntimeError
|
73
73
|
end
|
74
74
|
|
75
75
|
# Error raised by QI Validation when the query does not have QI tag
|
76
|
-
class NotEnoughEvidence <
|
76
|
+
class NotEnoughEvidence < RuntimeError
|
77
77
|
end
|
78
78
|
|
79
79
|
# Exception raised when there are alias duplications
|
80
|
-
class NoPIdentError <
|
80
|
+
class NoPIdentError < RuntimeError
|
81
81
|
end
|
82
82
|
|
83
83
|
# Exception raised when the tabular format does not correspond to the tabular
|
84
84
|
# argument
|
85
|
-
class InconsistentTabularFormat <
|
85
|
+
class InconsistentTabularFormat < RuntimeError
|
86
86
|
end
|
87
87
|
|
88
88
|
# Exception raised when there are more than one reading frame among the hits
|
89
89
|
# of one prediction
|
90
|
-
class ReadingFrameError <
|
90
|
+
class ReadingFrameError < RuntimeError
|
91
91
|
end
|
92
92
|
|
93
|
-
class OtherError <
|
93
|
+
class OtherError < RuntimeError
|
94
94
|
end
|
95
95
|
end
|
@@ -9,15 +9,15 @@ module GeneValidator
|
|
9
9
|
sum / length.to_f
|
10
10
|
end
|
11
11
|
|
12
|
-
def median
|
13
|
-
sorted = sort
|
12
|
+
def median(already_sorted = false)
|
13
|
+
sorted = already_sorted ? self : sort
|
14
14
|
len = sorted.length
|
15
15
|
(sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
|
16
16
|
end
|
17
17
|
|
18
18
|
def mode
|
19
|
-
freq =
|
20
|
-
|
19
|
+
freq = each_with_object(Hash.new(0)) { |v, h| h[v] += 1; }
|
20
|
+
max_by { |v| freq[v] }
|
21
21
|
end
|
22
22
|
|
23
23
|
def sample_variance
|
@@ -29,6 +29,23 @@ module GeneValidator
|
|
29
29
|
def standard_deviation
|
30
30
|
Math.sqrt(sample_variance)
|
31
31
|
end
|
32
|
+
|
33
|
+
def all_quartiles
|
34
|
+
sorted = sort
|
35
|
+
len = sorted.length
|
36
|
+
split = sorted.median_split
|
37
|
+
[
|
38
|
+
split[0].median(true),
|
39
|
+
sorted.median(true),
|
40
|
+
split[1].median(true)
|
41
|
+
]
|
42
|
+
end
|
43
|
+
|
44
|
+
def median_split
|
45
|
+
len = length
|
46
|
+
center = len % 2
|
47
|
+
[self[0..len / 2 - 1], self[len / 2 + center..-1]]
|
48
|
+
end
|
32
49
|
end
|
33
50
|
end
|
34
51
|
|
@@ -13,24 +13,25 @@ module GeneValidator
|
|
13
13
|
class RawSequences
|
14
14
|
class <<self
|
15
15
|
extend Forwardable
|
16
|
-
def_delegators GeneValidator, :opt, :config
|
16
|
+
def_delegators GeneValidator, :opt, :config, :dirs
|
17
17
|
|
18
18
|
def init
|
19
|
-
|
20
|
-
|
19
|
+
warn '==> Extracting fasta sequences for each BLAST HSP from the' \
|
20
|
+
' BLAST database'
|
21
21
|
|
22
22
|
@blast_file = opt[:blast_xml_file] if opt[:blast_xml_file]
|
23
23
|
@blast_file = opt[:blast_tabular_file] if opt[:blast_tabular_file]
|
24
24
|
|
25
|
-
|
26
|
-
|
25
|
+
fname = File.basename(@blast_file)
|
26
|
+
opt[:raw_sequences] = File.join(dirs[:tmp_dir], "#{fname}.raw_seq")
|
27
|
+
@index_file = File.join(dirs[:tmp_dir], "#{fname}.index")
|
27
28
|
end
|
28
29
|
|
29
30
|
##
|
30
31
|
# Obtains raw_sequences from BLAST output file...
|
31
32
|
def run
|
32
33
|
init
|
33
|
-
if opt[:db]
|
34
|
+
if opt[:db].match?(/remote/)
|
34
35
|
write_a_raw_seq_file(opt[:raw_sequences], 'remote')
|
35
36
|
else
|
36
37
|
write_an_index_file(@index_file, 'local')
|
@@ -55,12 +56,13 @@ module GeneValidator
|
|
55
56
|
index_hash = {}
|
56
57
|
keys.each_with_index do |k, i|
|
57
58
|
start = values[i]
|
58
|
-
endf =
|
59
|
+
endf = i == values.length - 1 ? content.length - 1 : values[i + 1]
|
59
60
|
index_hash[k] = [start, endf]
|
60
61
|
end
|
61
62
|
|
62
63
|
# create FASTA index
|
63
|
-
|
64
|
+
fname = File.basename(raw_seq_file)
|
65
|
+
config[:raw_seq_file_index] = File.join(dirs[:tmp_dir], "#{fname}.idx")
|
64
66
|
config[:raw_seq_file_load] = index_hash
|
65
67
|
|
66
68
|
File.open(config[:raw_seq_file_index], 'w') do |f|
|
@@ -76,29 +78,29 @@ module GeneValidator
|
|
76
78
|
iterate_xml(file, db_type) if opt[:blast_xml_file]
|
77
79
|
iterate_tabular(file, db_type) if opt[:blast_tabular_file]
|
78
80
|
rescue BLASTDBError
|
79
|
-
|
81
|
+
warn '*** BLAST Database Error: Genevalidator requires BLAST' \
|
80
82
|
" databases to be created with the '-parse_seqids argument."
|
81
|
-
|
82
|
-
|
83
|
+
warn ' See https://github.com/wurmlab/genevalidator' \
|
84
|
+
'#setting-up-a-blast-database for more information'
|
83
85
|
exit 1
|
84
|
-
rescue
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
86
|
+
rescue StandardError
|
87
|
+
warn '*** Error: There was an error in analysing the BLAST'
|
88
|
+
warn ' output file. Please ensure that BLAST output file'
|
89
|
+
warn ' is in the correct format and then try again. If you'
|
90
|
+
warn ' are using a remote database, please ensure that you'
|
91
|
+
warn ' have internet access.'
|
90
92
|
exit 1
|
91
93
|
ensure
|
92
94
|
file.close unless file.nil?
|
93
95
|
end
|
94
96
|
|
95
|
-
|
97
|
+
alias write_a_raw_seq_file write_an_index_file
|
96
98
|
|
97
99
|
def iterate_xml(file, db_type)
|
98
100
|
n = Bio::BlastXMLParser::XmlIterator.new(opt[:blast_xml_file]).to_enum
|
99
101
|
n.each do |iter|
|
100
102
|
iter.each do |hit|
|
101
|
-
|
103
|
+
raise BLASTDBError if hit.hit_id =~ /\|BL_ORD_ID\|/
|
102
104
|
if db_type == 'remote' || hit.hit_id.nil?
|
103
105
|
file.puts FetchRawSequences.extract_from_remote_db(hit.accession)
|
104
106
|
else
|
@@ -116,7 +118,7 @@ module GeneValidator
|
|
116
118
|
headers: table_headers)
|
117
119
|
|
118
120
|
rows.each do |row|
|
119
|
-
|
121
|
+
raise BLASTDBError if row['sseqid'] =~ /\|BL_ORD_ID\|/i
|
120
122
|
if db_type == 'remote' || row['sseqid'].nil?
|
121
123
|
file.puts FetchRawSequences.extract_from_remote_db(row['sacc'])
|
122
124
|
else
|
@@ -136,15 +138,15 @@ module GeneValidator
|
|
136
138
|
# first try to extract from previously created raw_sequences HASH
|
137
139
|
raw_seq = extract_from_index(identifier) if opt[:raw_sequences]
|
138
140
|
# then try to just extract that sequence based on accession.
|
139
|
-
if opt[:db] !~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/)
|
141
|
+
if opt[:db] !~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/i)
|
140
142
|
raw_seq = extract_from_local_db(false, accession)
|
141
143
|
end
|
142
144
|
# then try to extract from remote database
|
143
|
-
if opt[:db] =~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/)
|
145
|
+
if opt[:db] =~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/i)
|
144
146
|
raw_seq = extract_from_remote_db(accession)
|
145
147
|
end
|
146
148
|
# return nil if the raw_sequence still produces an error.
|
147
|
-
|
149
|
+
raw_seq =~ /Error/i ? nil : raw_seq
|
148
150
|
end
|
149
151
|
|
150
152
|
##
|
@@ -157,8 +159,8 @@ module GeneValidator
|
|
157
159
|
idx = config[:raw_seq_file_load][identifier]
|
158
160
|
query = IO.binread(opt[:raw_sequences], idx[1] - idx[0], idx[0])
|
159
161
|
parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
|
160
|
-
parse_query[1].
|
161
|
-
rescue
|
162
|
+
parse_query[1].delete("\n")
|
163
|
+
rescue StandardError
|
162
164
|
'Error' # return error so it can then try alternative fetching method.
|
163
165
|
end
|
164
166
|
|
@@ -170,7 +172,7 @@ module GeneValidator
|
|
170
172
|
# Output:
|
171
173
|
# String with the nucleotide sequence corresponding to the accession
|
172
174
|
def extract_from_local_db(batch, accno = nil, idx_file = nil)
|
173
|
-
cmd =
|
175
|
+
cmd = batch ? batch_raw_seq_cmd(idx_file) : single_raw_seq_cmd(accno)
|
174
176
|
efile = Tempfile.new('blast_out')
|
175
177
|
`#{cmd} &>#{efile.path}`
|
176
178
|
raw_seqs = efile.read
|
@@ -193,9 +195,9 @@ module GeneValidator
|
|
193
195
|
def failed_raw_sequences(blast_output)
|
194
196
|
blast_output.each_line do |line|
|
195
197
|
acc = line.match(/Error: (\w+): OID not found/)[1]
|
196
|
-
|
198
|
+
warn "\nCould not find sequence '#{acc.chomp}' within the" \
|
197
199
|
' BLAST database.'
|
198
|
-
|
200
|
+
warn "Attempting to obtain sequence '#{acc.chomp}' from" \
|
199
201
|
' remote BLAST databases.'
|
200
202
|
File.open(opt[:raw_sequences], 'a+') do |f|
|
201
203
|
f.puts extract_from_remote_db(acc)
|
@@ -204,13 +206,12 @@ module GeneValidator
|
|
204
206
|
end
|
205
207
|
|
206
208
|
def extract_from_remote_db(accession, db_seq_type = 'protein')
|
207
|
-
uri = '
|
209
|
+
uri = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' \
|
208
210
|
"db=#{db_seq_type}&retmax=1&usehistory=y&term=#{accession}/"
|
209
211
|
result = Net::HTTP.get(URI.parse(uri))
|
210
212
|
query = result.match(%r{<\bQueryKey\b>([\w\W\d]+)</\bQueryKey\b>})[1]
|
211
213
|
web_env = result.match(%r{<\bWebEnv\b>([\w\W\d]+)</\bWebEnv\b>})[1]
|
212
|
-
|
213
|
-
uri = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
|
214
|
+
uri = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
|
214
215
|
'rettype=fasta&retmode=text&retstart=0&retmax=1&' \
|
215
216
|
"db=#{db_seq_type}&query_key=#{query}&WebEnv=#{web_env}"
|
216
217
|
result = Net::HTTP.get(URI.parse(uri))
|
data/lib/genevalidator/hsp.rb
CHANGED
@@ -1,9 +1,14 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
1
3
|
require 'genevalidator/blast'
|
2
4
|
require 'genevalidator/exceptions'
|
3
5
|
|
4
6
|
module GeneValidator
|
5
7
|
# A class that initialises the BLAST tabular attributes
|
6
8
|
class Hsp
|
9
|
+
extend Forwardable
|
10
|
+
def_delegators GeneValidator, :config
|
11
|
+
|
7
12
|
attr_accessor :hit_from # ref. from the unaligned hit sequence
|
8
13
|
attr_accessor :hit_to
|
9
14
|
attr_accessor :match_query_from # ref. from the unaligned query sequence
|
@@ -23,9 +28,30 @@ module GeneValidator
|
|
23
28
|
attr_accessor :gaps
|
24
29
|
attr_accessor :align_len
|
25
30
|
|
26
|
-
def initialize
|
31
|
+
def initialize(input = {})
|
27
32
|
@query_alignment = nil
|
28
33
|
@hit_alignment = nil
|
34
|
+
init_xml_attributes(input[:xml_input]) if input[:xml_input]
|
35
|
+
init_tabular_attribute(input[:tabular_input]) if input[:tabular_input]
|
36
|
+
end
|
37
|
+
|
38
|
+
def init_xml_attributes(hsp)
|
39
|
+
@match_query_from = hsp.query_from.to_i
|
40
|
+
@match_query_to = hsp.query_to.to_i
|
41
|
+
@query_reading_frame = hsp.query_frame.to_i
|
42
|
+
@hit_from = hsp.hit_from.to_i
|
43
|
+
@hit_to = hsp.hit_to.to_i
|
44
|
+
@query_alignment = hsp.qseq.to_s
|
45
|
+
@hit_alignment = hsp.hseq.to_s
|
46
|
+
@align_len = hsp.align_len.to_i
|
47
|
+
@pidentity = (100 * hsp.identity / hsp.align_len.to_f).round(2)
|
48
|
+
@identity = hsp.identity.to_i
|
49
|
+
@hsp_evalue = format('%.0e', hsp.evalue)
|
50
|
+
assert_seq_type(@hit_alignment) if @hit_alignment
|
51
|
+
assert_seq_type(@query_alignment) if @query_alignment
|
52
|
+
return unless config[:type] == :nucleotide
|
53
|
+
@match_query_from = (@match_query_from / 3) + 1
|
54
|
+
@match_query_to = (@match_query_to / 3) + 1
|
29
55
|
end
|
30
56
|
|
31
57
|
##
|
@@ -52,7 +78,10 @@ module GeneValidator
|
|
52
78
|
|
53
79
|
def assert_seq_type(query)
|
54
80
|
seq_type = BlastUtils.guess_sequence_type(query)
|
55
|
-
|
81
|
+
raise SequenceTypeError if seq_type != :protein
|
82
|
+
rescue SequenceTypeError => e
|
83
|
+
warn e
|
84
|
+
exit 1
|
56
85
|
end
|
57
86
|
end
|
58
87
|
end
|
@@ -1,10 +1,7 @@
|
|
1
|
-
require 'erb'
|
2
|
-
require 'fileutils'
|
3
1
|
require 'forwardable'
|
4
2
|
require 'json'
|
5
3
|
|
6
4
|
require 'genevalidator'
|
7
|
-
require 'genevalidator/output'
|
8
5
|
require 'genevalidator/version'
|
9
6
|
|
10
7
|
module GeneValidator
|
@@ -12,139 +9,58 @@ module GeneValidator
|
|
12
9
|
class JsonToGVResults
|
13
10
|
class << self
|
14
11
|
extend Forwardable
|
15
|
-
def_delegators GeneValidator, :opt
|
12
|
+
def_delegators GeneValidator, :opt, :config, :dirs
|
16
13
|
|
17
|
-
def init
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
aux: File.expand_path(File.join(File.dirname(__FILE__), '../../aux')),
|
23
|
-
filename: File.basename(@opt[:json_file]),
|
24
|
-
output_max: 2500,
|
25
|
-
run_no: 0
|
26
|
-
}
|
27
|
-
@json_array = load_json_file
|
14
|
+
def init(opt)
|
15
|
+
GeneValidator.opt = opt
|
16
|
+
GeneValidator.config = { output_max: 2500, run_no: 0,
|
17
|
+
json_output: load_json_file }
|
18
|
+
GeneValidator.dirs = GeneValidator.setup_dirnames(opt[:json_file])
|
28
19
|
end
|
29
20
|
|
30
21
|
def run
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
output_html = output_filename
|
37
|
-
generate_html_header(output_html) unless File.exist?(output_html)
|
38
|
-
generate_html_query(output_html, row)
|
22
|
+
warn '==> Parsing input JSON results'
|
23
|
+
print_console_header(config[:json_output][0])
|
24
|
+
config[:json_output].each do |row|
|
25
|
+
print_output_console(row)
|
26
|
+
create_row_json_plot_files(row)
|
39
27
|
end
|
40
|
-
|
41
|
-
calculate_overall_score
|
28
|
+
GeneValidator.produce_output
|
42
29
|
end
|
43
30
|
|
44
|
-
def
|
45
|
-
|
46
|
-
|
31
|
+
def print_console_header(first_row)
|
32
|
+
return unless opt[:output_formats].include? 'stdout'
|
33
|
+
return if config[:console_header_printed]
|
34
|
+
config[:console_header_printed] = true
|
35
|
+
warn '' # blank line
|
36
|
+
c_fmt = "%3s\t%5s\t%20s\t%7s\t"
|
37
|
+
print format(c_fmt, 'No', 'Score', 'Identifier', 'No_Hits')
|
38
|
+
puts first_row[:validations].keys.join("\t")
|
47
39
|
end
|
48
40
|
|
49
|
-
def
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
File.join(@config[:html_path], "results#{i}.html")
|
58
|
-
end
|
59
|
-
|
60
|
-
def generate_html_header(output_html)
|
61
|
-
return if File.exist?(output_html)
|
62
|
-
json_header_template = File.join(@config[:aux], 'json_header.erb')
|
63
|
-
template_contents = File.open(json_header_template, 'r').read
|
64
|
-
erb = ERB.new(template_contents, 0, '>')
|
65
|
-
File.open(output_html, 'w+') { |f| f.write(erb.result(binding)) }
|
66
|
-
end
|
67
|
-
|
68
|
-
def generate_html_query(output_html, row)
|
69
|
-
@row = row
|
70
|
-
json_query_template = File.join(@config[:aux], 'json_query.erb')
|
71
|
-
template_contents = File.open(json_query_template, 'r').read
|
72
|
-
erb = ERB.new(template_contents, 0, '>')
|
73
|
-
File.open(output_html, 'a') { |f| f.write(erb.result(binding)) }
|
74
|
-
end
|
75
|
-
|
76
|
-
# Add footer to all output files
|
77
|
-
def html_footer
|
78
|
-
no_of_output_files = (@config[:run_no].to_f / @config[:output_max]).ceil
|
79
|
-
|
80
|
-
output_files = []
|
81
|
-
(1..no_of_output_files).each { |i| output_files << "results#{i}.html" }
|
82
|
-
|
83
|
-
write_html_footer(no_of_output_files, output_files)
|
84
|
-
end
|
85
|
-
|
86
|
-
def write_html_footer(no_of_output_files, output_files)
|
87
|
-
turn_off_automated_sorting
|
88
|
-
json_footer_template = File.join(@config[:aux], 'json_footer.erb')
|
89
|
-
template_contents = File.open(json_footer_template, 'r').read
|
90
|
-
erb = ERB.new(template_contents, 0, '>')
|
91
|
-
(1..no_of_output_files).each do |i|
|
92
|
-
results_html = File.join(@config[:html_path], "results#{i}.html")
|
93
|
-
File.open(results_html, 'a+') { |f| f.write(erb.result(binding)) }
|
41
|
+
def print_output_console(row)
|
42
|
+
return unless opt[:output_formats].include? 'stdout'
|
43
|
+
c_fmt = "%3s\t%5s\t%20s\t%7s\t"
|
44
|
+
short_def = row[:definition].split(' ')[0]
|
45
|
+
print format(c_fmt, row[:idx], row[:overall_score], short_def,
|
46
|
+
row[:no_hits])
|
47
|
+
puts row[:validations].values.map { |e| e[:print] }.join("\t")
|
48
|
+
.gsub(' ', ' ')
|
94
49
|
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# By default, on page load, the results are automatically sorted by the
|
98
|
-
# index. However since the whole idea is that users would sort by JSON,
|
99
|
-
# this is not wanted here.
|
100
|
-
def turn_off_automated_sorting
|
101
|
-
script_file = File.join(@config[:html_path],
|
102
|
-
'files/js/genevalidator.compiled.min.js')
|
103
|
-
original_content = File.read(script_file)
|
104
|
-
# removes the automatic sort on page load
|
105
|
-
updated_content = original_content.gsub(',sortList:[[0,0]]', '')
|
106
|
-
File.open("#{script_file}.tmp", 'w') { |f| f.puts updated_content }
|
107
|
-
FileUtils.mv("#{script_file}.tmp", script_file)
|
108
|
-
end
|
109
50
|
|
110
|
-
|
111
|
-
scores = []
|
112
|
-
@json_array.each { |row| scores << row['overall_score'] }
|
113
|
-
plot_dir = File.join(@config[:html_path], 'files/json')
|
114
|
-
less = generate_evaluation(scores)
|
115
|
-
Output.create_overview_json(scores, plot_dir, less, less)
|
116
|
-
end
|
117
|
-
|
118
|
-
def generate_evaluation(scores)
|
119
|
-
no_of_queries = scores.length
|
120
|
-
good_scores = scores.count { |s| s >= 75 }
|
121
|
-
bad_scores = scores.count { |s| s < 75 }
|
122
|
-
nee = calculate_no_quries_with_no_evidence # nee = no evidence
|
51
|
+
private
|
123
52
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
' were:<br>' \
|
129
|
-
"#{good_pred} good prediction(s),<br>" \
|
130
|
-
"#{bad_pred} possibly weak prediction(s).<br>"
|
131
|
-
return eval if nee == 0
|
132
|
-
eval << "#{nee} could not be evaluated due to the lack of" \
|
133
|
-
' evidence.<br>'
|
134
|
-
eval
|
135
|
-
end
|
53
|
+
def load_json_file
|
54
|
+
json_contents = File.read(File.expand_path(opt[:json_file]))
|
55
|
+
JSON.parse(json_contents, symbolize_names: true)
|
56
|
+
end
|
136
57
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
if status.count { |r| r == 'warning' } == status.length
|
143
|
-
all_warnings += 1
|
144
|
-
end
|
58
|
+
def create_row_json_plot_files(row)
|
59
|
+
config[:run_no] += 1
|
60
|
+
fname = "#{dirs[:filename]}_#{row[:idx]}.json"
|
61
|
+
json_file = File.join(dirs[:json_dir], fname)
|
62
|
+
File.open(json_file, 'w') { |f| f.write(row.to_json) }
|
145
63
|
end
|
146
|
-
all_warnings
|
147
|
-
end
|
148
64
|
end
|
149
65
|
end
|
150
66
|
end
|